From b47661028d45e4b1296f0c6ac4932b4d2ad6f1e4 Mon Sep 17 00:00:00 2001
From: Mikayla Gawarecki
Date: Sun, 16 Nov 2025 21:28:27 -0800
Subject: [PATCH] Test that TORCH_FEATURE_VERSION guards are used where needed

[ghstack-poisoned]
---
 .../libtorch_agnostic_2_10/csrc/kernel.cpp    | 205 ------------
 .../make_tensor_clones_and_call_foreach.cpp   |  41 +++
 .../csrc/mv_tensor_accessor_cpu.cpp           |  40 +++
 .../csrc/mv_tensor_accessor_cuda.cu           |  47 +++
 .../csrc/my__foreach_mul.cpp                  |  20 ++
 .../csrc/my__foreach_mul_.cpp                 |  19 ++
 .../libtorch_agnostic_2_10/csrc/my_empty.cpp  |  25 ++
 .../csrc/my_reshape.cpp                       |  17 +
 .../libtorch_agnostic_2_10/csrc/my_view.cpp   |  20 ++
 .../csrc/tensor_accessor_kernel.h             |  28 ++
 .../csrc/test_device_constructor.cpp          |  37 +++
 .../csrc/test_device_equality.cpp             |  14 +
 .../csrc/test_device_index.cpp                |  14 +
 .../csrc/test_device_is_cpu.cpp               |  14 +
 .../csrc/test_device_is_cuda.cpp              |  14 +
 .../csrc/test_device_set_index.cpp            |  17 +
 .../csrc/test_get_num_threads.cpp             |  14 +
 .../csrc/test_parallel_for.cpp                |  49 +++
 .../csrc/test_tensor_device.cpp               |  17 +
 .../test_version_compatibility.py             | 300 ++++++++++++++++++
 20 files changed, 747 insertions(+), 205 deletions(-)
 delete mode 100644 test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/kernel.cpp
 create mode 100644 test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/make_tensor_clones_and_call_foreach.cpp
 create mode 100644 test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/mv_tensor_accessor_cpu.cpp
 create mode 100644 test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/mv_tensor_accessor_cuda.cu
 create mode 100644 test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/my__foreach_mul.cpp
 create mode 100644 test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/my__foreach_mul_.cpp
 create mode 100644 test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/my_empty.cpp
 create mode 100644 test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/my_reshape.cpp
 create mode 100644 test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/my_view.cpp
 create mode 100644 test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/tensor_accessor_kernel.h
 create mode 100644 test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/test_device_constructor.cpp
 create mode 100644 test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/test_device_equality.cpp
 create mode 100644 test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/test_device_index.cpp
 create mode 100644 test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/test_device_is_cpu.cpp
 create mode 100644 test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/test_device_is_cuda.cpp
 create mode 100644 test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/test_device_set_index.cpp
 create mode 100644 test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/test_get_num_threads.cpp
 create mode 100644 test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/test_parallel_for.cpp
 create mode 100644 test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/test_tensor_device.cpp
 create mode 100644 test/cpp_extensions/libtorch_agnostic_2_10_extension/test_version_compatibility.py

diff --git a/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/kernel.cpp b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/kernel.cpp
deleted file mode 100644
index 72c78984b5215..0000000000000
--- a/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/kernel.cpp
+++ /dev/null
@@ -1,205 +0,0 @@
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-#ifdef LAE_USE_CUDA
-#include
-#endif
-
-#include
-
-using torch::stable::Tensor;
-
-std::vector my__foreach_mul(torch::headeronly::HeaderOnlyArrayRef self, torch::headeronly::HeaderOnlyArrayRef other) {
-  std::array stack = {torch::stable::detail::from(self), torch::stable::detail::from(other)};
-  aoti_torch_call_dispatcher("aten::_foreach_mul", "List", stack.data());
-  return torch::stable::detail::to>(stack[0]);
-}
-
-void my__foreach_mul_(torch::headeronly::HeaderOnlyArrayRef self, torch::headeronly::HeaderOnlyArrayRef other) {
-  std::array stack = {torch::stable::detail::from(self), torch::stable::detail::from(other)};
-  aoti_torch_call_dispatcher("aten::_foreach_mul_", "List", stack.data());
-}
-
-Tensor my_clone(Tensor t) {
-  return clone(t);
-}
-
-std::vector make_tensor_clones_and_call_foreach(Tensor t1, Tensor t2) {
-  // This function tests that my__foreach_mul can take in std::initializer_lists
-  // in addition to std::vectors.
-  Tensor t1_1 = my_clone(t1);
-  Tensor t1_2 = my_clone(t1);
-  Tensor t2_1 = my_clone(t2);
-  Tensor t2_2 = my_clone(t2);
-  return my__foreach_mul({t1_1, t2_1}, {t1_2, t2_2});
-}
-
-STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
-  m.def("my__foreach_mul(Tensor[] self, Tensor[] other) -> Tensor[]");
-  m.def("my__foreach_mul_(Tensor(a!)[] self, Tensor[] other) -> ()");
-  m.def("make_tensor_clones_and_call_foreach(Tensor t1, Tensor t2) -> Tensor[]");
-}
-
-STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
-  m.impl("my__foreach_mul", TORCH_BOX(&my__foreach_mul));
-  m.impl("my__foreach_mul_", TORCH_BOX(&my__foreach_mul_));
-  m.impl("make_tensor_clones_and_call_foreach", TORCH_BOX(&make_tensor_clones_and_call_foreach));
-}
-
-// Test functions for torch::stable::Tensor device method
-
-torch::stable::Device test_tensor_device(torch::stable::Tensor tensor) {
-  return tensor.device();
-}
-
-// Test functions for torch::stable::Device
-
-torch::stable::Device test_device_constructor(
-    bool is_cuda,
-    torch::stable::DeviceIndex index,
-    bool use_str) {
-  using torch::stable::Device;
-  using torch::stable::DeviceType;
-
-  if (use_str) {
-    std::string device_str;
-    if (is_cuda) {
-      device_str = "cuda:" + std::to_string(index);
-    } else {
-      device_str = "cpu";
-    }
-    return Device(device_str);
-  } else {
-    if (is_cuda) {
-      return Device(DeviceType::CUDA, index);
-    } else {
-      return Device(DeviceType::CPU);
-    }
-  }
-}
-
-bool test_device_equality(torch::stable::Device d1, torch::stable::Device d2) {
-  return d1 == d2;
-}
-
-torch::stable::Device test_device_set_index(
-    torch::stable::Device device,
-    torch::stable::DeviceIndex index) {
-  device.set_index(index);
-  return device;
-}
-
-torch::stable::DeviceIndex test_device_index(torch::stable::Device device) {
-  return device.index();
-}
-
-bool test_device_is_cuda(torch::stable::Device device) {
-  return device.is_cuda();
-}
-
-bool test_device_is_cpu(torch::stable::Device device) {
-  return device.is_cpu();
-}
-
-STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
-  m.def("test_tensor_device(Tensor t) -> Device");
-  m.def(
-      "test_device_constructor(bool is_cuda, DeviceIndex index, bool use_str) -> Device");
-  m.def("test_device_equality(Device d1, Device d2) -> bool");
-  m.def("test_device_set_index(Device device, DeviceIndex index) -> Device");
-  m.def("test_device_index(Device device) -> DeviceIndex");
-  m.def("test_device_is_cuda(Device device) -> bool");
-  m.def("test_device_is_cpu(Device device) -> bool");
-}
-
-STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
-  m.impl("test_tensor_device", TORCH_BOX(&test_tensor_device));
-  m.impl("test_device_constructor", TORCH_BOX(&test_device_constructor));
-  m.impl("test_device_equality", TORCH_BOX(&test_device_equality));
-  m.impl("test_device_set_index", TORCH_BOX(&test_device_set_index));
-  m.impl("test_device_index", TORCH_BOX(&test_device_index));
-  m.impl("test_device_is_cuda", TORCH_BOX(&test_device_is_cuda));
-  m.impl("test_device_is_cpu", TORCH_BOX(&test_device_is_cpu));
-}
-
-Tensor test_parallel_for(int64_t size, int64_t grain_size) {
-  AtenTensorHandle tensor_handle;
-  int64_t stride = 1;
-
-  aoti_torch_empty_strided(
-      1,
-      &size,
-      &stride,
-      aoti_torch_dtype_int64(),
-      aoti_torch_device_type_cpu(),
-      0,
-      &tensor_handle);
-
-  Tensor tensor(tensor_handle);
-  int64_t* data_ptr = reinterpret_cast(tensor.data_ptr());
-
-  torch::stable::zero_(tensor);
-
-  // Use parallel_for to fill each element with its index
-  // If using a parallel path, the thread id is encoded in the upper 32 bits
-  torch::stable::parallel_for(
-      0, size, grain_size, [data_ptr](int64_t begin, int64_t end) {
-        for (auto i = begin; i < end; i++) {
-          STD_TORCH_CHECK(i <= UINT32_MAX);
-          uint32_t thread_id;
-          torch_get_thread_idx(&thread_id);
-          data_ptr[i] = i | (static_cast(thread_id) << 32);
-        }
-      });
-
-  return tensor;
-}
-
-uint32_t test_get_num_threads() {
-  return torch::stable::get_num_threads();
-}
-
-STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
-  m.def("test_parallel_for(int size, int grain_size) -> Tensor");
-  m.def("test_get_num_threads() -> int");
-}
-
-STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
-  m.impl("test_parallel_for", TORCH_BOX(&test_parallel_for));
-  m.impl("test_get_num_threads", TORCH_BOX(&test_get_num_threads));
-}
-
-Tensor my_empty(
-    torch::headeronly::HeaderOnlyArrayRef size,
-    std::optional dtype,
-    std::optional device,
-    std::optional pin_memory) {
-  return empty(size, dtype, device, pin_memory);
-}
-
-Tensor my_reshape(Tensor t, torch::headeronly::HeaderOnlyArrayRef shape) {
-  return reshape(t, shape);
-}
-
-Tensor my_view(Tensor t, torch::headeronly::HeaderOnlyArrayRef size) {
-  return view(t, size);
-}
-
-STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
-  m.def(
-      "my_empty(int[] size, ScalarType? dtype=None, Device? device=None, bool? pin_memory=None) -> Tensor");
pin_memory=None) -> Tensor"); - m.def("my_reshape(Tensor t, int[] shape) -> Tensor"); - m.def("my_view(Tensor t, int[] size) -> Tensor"); -} - -STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) { - m.impl("my_empty", TORCH_BOX(&my_empty)); - m.impl("my_reshape", TORCH_BOX(&my_reshape)); - m.impl("my_view", TORCH_BOX(&my_view)); -} diff --git a/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/make_tensor_clones_and_call_foreach.cpp b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/make_tensor_clones_and_call_foreach.cpp new file mode 100644 index 0000000000000..d3dbab5891394 --- /dev/null +++ b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/make_tensor_clones_and_call_foreach.cpp @@ -0,0 +1,41 @@ +#include +#include +#include + +#include + +using torch::stable::Tensor; + +// Declare my__foreach_mul (defined in my__foreach_mul.cpp) +extern std::vector my__foreach_mul( + torch::headeronly::HeaderOnlyArrayRef self, + torch::headeronly::HeaderOnlyArrayRef other); + +// Helper function for cloning +Tensor my_clone(Tensor t) { + return clone(t); +} + +std::vector make_tensor_clones_and_call_foreach(Tensor t1, Tensor t2) { + // This function tests that my__foreach_mul can take in std::initializer_lists + // in addition to std::vectors. + Tensor t1_1 = my_clone(t1); + Tensor t1_2 = my_clone(t1); + Tensor t2_1 = my_clone(t2); + Tensor t2_2 = my_clone(t2); + return my__foreach_mul({t1_1, t2_1}, {t1_2, t2_2}); +} + +STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) { + m.def( + "make_tensor_clones_and_call_foreach(Tensor t1, Tensor t2) -> Tensor[]"); +} + +STABLE_TORCH_LIBRARY_IMPL( + libtorch_agnostic_2_10, + CompositeExplicitAutograd, + m) { + m.impl( + "make_tensor_clones_and_call_foreach", + TORCH_BOX(&make_tensor_clones_and_call_foreach)); +} diff --git a/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/mv_tensor_accessor_cpu.cpp b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/mv_tensor_accessor_cpu.cpp new file mode 100644 index 0000000000000..705439efffe63 --- /dev/null +++ b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/mv_tensor_accessor_cpu.cpp @@ -0,0 +1,40 @@ +// This is duplicated from the libtorch_agnostic_2_9_extension +// as a negative test for test_version_compatibility.py + +#include +#include +#include +#include +#include +#include +#include + +#include "tensor_accessor_kernel.h" + +using torch::stable::Tensor; + +Tensor mv_tensor_accessor_cpu(Tensor m, Tensor v) { + STD_TORCH_CHECK(m.dim() == 2, "m must be 2D"); + STD_TORCH_CHECK(v.dim() == 1, "v must be 1D"); + STD_TORCH_CHECK(m.size(1) == v.size(0), "m.shape[1] == v.shape[0] must hold"); + STD_TORCH_CHECK(m.scalar_type() == v.scalar_type(), "m and v must have the same dtype"); + STD_TORCH_CHECK(m.device() == v.device(), "m and v must be on the same device"); + Tensor res = new_empty(m, {m.size(0)}); + THO_DISPATCH_V2(m.scalar_type(), "mv_tensor_accessor_cpu", + AT_WRAP(([&]() { + auto resa = Accessor_cpu(reinterpret_cast(res.data_ptr()), res.sizes().data(), res.strides().data()); + auto ma = Accessor_cpu(reinterpret_cast(m.data_ptr()), m.sizes().data(), m.strides().data()); + auto va = Accessor_cpu(reinterpret_cast(v.data_ptr()), v.sizes().data(), v.strides().data()); + mv_tensor_accessor_kernel(resa, ma, va); + })), + AT_FLOATING_TYPES); + return res; +} + 
+STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
+  m.def("mv_tensor_accessor_cpu(Tensor res, Tensor m, Tensor v) -> Tensor");
+}
+
+STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
+  m.impl("mv_tensor_accessor_cpu", TORCH_BOX(&mv_tensor_accessor_cpu));
+}
diff --git a/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/mv_tensor_accessor_cuda.cu b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/mv_tensor_accessor_cuda.cu
new file mode 100644
index 0000000000000..7773210a089ee
--- /dev/null
+++ b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/mv_tensor_accessor_cuda.cu
@@ -0,0 +1,47 @@
+// This is duplicated from the libtorch_agnostic_2_9_extension
+// as a negative test for test_version_compatibility.py
+
+#include "tensor_accessor_kernel.h"
+
+#include
+#include
+#include
+#include
+
+using torch::stable::Tensor;
+
+Tensor mv_tensor_accessor_cuda(Tensor m, Tensor v) {
+  STD_TORCH_CHECK(m.dim() == 2, "m must be 2D");
+  STD_TORCH_CHECK(v.dim() == 1, "v must be 1D");
+  STD_TORCH_CHECK(m.size(1) == v.size(0), "m.shape[1] == v.shape[0] must hold");
+  STD_TORCH_CHECK(
+      m.scalar_type() == v.scalar_type(), "m and v must have the same dtype");
+  STD_TORCH_CHECK(
+      m.device() == v.device(), "m and v must be on the same device");
+  Tensor res = new_empty(m, {m.size(0)});
+  THO_DISPATCH_V2(
+      m.scalar_type(),
+      "mv_tensor_accessor_cuda",
+      AT_WRAP(([&]() {
+        auto resa = Accessor_cuda(
+            reinterpret_cast(res.data_ptr()),
+            res.sizes().data(),
+            res.strides().data());
+        auto ma = Accessor_cuda(
+            reinterpret_cast(m.data_ptr()),
+            m.sizes().data(),
+            m.strides().data());
+        auto va = Accessor_cuda(
+            reinterpret_cast(v.data_ptr()),
+            v.sizes().data(),
+            v.strides().data());
+        mv_tensor_accessor_kernel
+            <<<1, 1, 0, 0>>>(resa, ma, va);
+      })),
+      AT_FLOATING_TYPES);
+  return res;
+}
+
+STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CUDA, m) {
+  m.impl("mv_tensor_accessor", TORCH_BOX(&mv_tensor_accessor_cuda));
+}
diff --git a/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/my__foreach_mul.cpp b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/my__foreach_mul.cpp
new file mode 100644
index 0000000000000..834a63afea646
--- /dev/null
+++ b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/my__foreach_mul.cpp
@@ -0,0 +1,20 @@
+#include
+#include
+#include
+#include
+
+using torch::stable::Tensor;
+
+std::vector my__foreach_mul(torch::headeronly::HeaderOnlyArrayRef self, torch::headeronly::HeaderOnlyArrayRef other) {
+  std::array stack = {torch::stable::detail::from(self), torch::stable::detail::from(other)};
+  aoti_torch_call_dispatcher("aten::_foreach_mul", "List", stack.data());
+  return torch::stable::detail::to>(stack[0]);
+}
+
+STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
+  m.def("my__foreach_mul(Tensor[] self, Tensor[] other) -> Tensor[]");
+}
+
+STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
+  m.impl("my__foreach_mul", TORCH_BOX(&my__foreach_mul));
+}
diff --git a/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/my__foreach_mul_.cpp b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/my__foreach_mul_.cpp
new file mode 100644
index 0000000000000..8409e6890bdd0
--- /dev/null
+++ b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/my__foreach_mul_.cpp
@@ -0,0 +1,19 @@
+#include
+#include
+#include
+#include
+
+using torch::stable::Tensor;
+
+void my__foreach_mul_(torch::headeronly::HeaderOnlyArrayRef self, torch::headeronly::HeaderOnlyArrayRef other) {
+  std::array stack = {torch::stable::detail::from(self), torch::stable::detail::from(other)};
+  aoti_torch_call_dispatcher("aten::_foreach_mul_", "List", stack.data());
+}
+
+STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
+  m.def("my__foreach_mul_(Tensor(a!)[] self, Tensor[] other) -> ()");
+}
+
+STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
+  m.impl("my__foreach_mul_", TORCH_BOX(&my__foreach_mul_));
+}
diff --git a/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/my_empty.cpp b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/my_empty.cpp
new file mode 100644
index 0000000000000..6278dca9f281d
--- /dev/null
+++ b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/my_empty.cpp
@@ -0,0 +1,25 @@
+#include
+#include
+#include
+#include
+
+#include
+
+using torch::stable::Tensor;
+
+Tensor my_empty(
+    torch::headeronly::HeaderOnlyArrayRef size,
+    std::optional dtype,
+    std::optional device,
+    std::optional pin_memory) {
+  return empty(size, dtype, device, pin_memory);
+}
+
+STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
+  m.def(
+      "my_empty(int[] size, ScalarType? dtype=None, Device? device=None, bool? pin_memory=None) -> Tensor");
+}
+
+STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
+  m.impl("my_empty", TORCH_BOX(&my_empty));
+}
diff --git a/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/my_reshape.cpp b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/my_reshape.cpp
new file mode 100644
index 0000000000000..0a2b1f70f2156
--- /dev/null
+++ b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/my_reshape.cpp
@@ -0,0 +1,17 @@
+#include
+#include
+#include
+
+using torch::stable::Tensor;
+
+Tensor my_reshape(Tensor t, torch::headeronly::HeaderOnlyArrayRef shape) {
+  return reshape(t, shape);
+}
+
+STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
+  m.def("my_reshape(Tensor t, int[] shape) -> Tensor");
+}
+
+STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
+  m.impl("my_reshape", TORCH_BOX(&my_reshape));
+}
diff --git a/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/my_view.cpp b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/my_view.cpp
new file mode 100644
index 0000000000000..25d8c54589247
--- /dev/null
+++ b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/my_view.cpp
@@ -0,0 +1,20 @@
+#include
+#include
+#include
+
+using torch::stable::Tensor;
+
+Tensor my_view(Tensor t, torch::headeronly::HeaderOnlyArrayRef size) {
+  return view(t, size);
+}
+
+STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
+  m.def("my_view(Tensor t, int[] size) -> Tensor");
+}
+
+STABLE_TORCH_LIBRARY_IMPL(
+    libtorch_agnostic_2_10,
+    CompositeExplicitAutograd,
+    m) {
+  m.impl("my_view", TORCH_BOX(&my_view));
+}
diff --git a/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/tensor_accessor_kernel.h b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/tensor_accessor_kernel.h
new file mode 100644
index 0000000000000..f1031f38060cf
--- /dev/null
+++ b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/tensor_accessor_kernel.h
@@ -0,0 +1,28 @@
+#pragma once
+
+#include
+#include
+
+template
+using Accessor_cpu = torch::headeronly::HeaderOnlyTensorAccessor;
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+#define MAYBE_GLOBAL __global__
+
+template
+using Accessor_cuda = torch::headeronly::HeaderOnlyGenericPackedTensorAccessor;
+
+#else
+#define MAYBE_GLOBAL
+#endif
+
+template