diff --git a/.ci/scripts/setup-windows.ps1 b/.ci/scripts/setup-windows.ps1
index 329e81b3cf0..20d29e4f558 100644
--- a/.ci/scripts/setup-windows.ps1
+++ b/.ci/scripts/setup-windows.ps1
@@ -1,5 +1,5 @@
 param (
-    [string]$editable = "false"
+    [string]$editable = $false
 )
 
 conda create --yes --quiet -n et python=3.12
diff --git a/.ci/scripts/unittest-windows.ps1 b/.ci/scripts/unittest-windows.ps1
index 6f1365bc3fc..65ed303051b 100644
--- a/.ci/scripts/unittest-windows.ps1
+++ b/.ci/scripts/unittest-windows.ps1
@@ -1,38 +1,15 @@
 param (
-    [string]$buildMode = "Release"
+    [string]$editable = $false
 )
 
 Set-PSDebug -Trace 1
 $ErrorActionPreference = 'Stop'
 $PSNativeCommandUseErrorActionPreference = $true
 
-# Run native unit tests (via ctest)
-New-Item -Path "test-build" -ItemType Directory
-cd "test-build"
-
-cmake .. --preset windows -B . -DEXECUTORCH_BUILD_TESTS=ON -DCMAKE_BUILD_TYPE=$buildMode
-if ($LASTEXITCODE -ne 0) {
-    Write-Host "CMake configuration was unsuccessful. Exit code: $LASTEXITCODE."
-    exit $LASTEXITCODE
-}
-
-cmake --build . -j8 --config $buildMode --verbose
-if ($LASTEXITCODE -ne 0) {
-    Write-Host "CMake build was unsuccessful. Exit code: $LASTEXITCODE."
-    exit $LASTEXITCODE
-}
-
-ctest -j8 . --build-config $buildMode --output-on-failure -E "method_test|tensor_parser_test"
-if ($LASTEXITCODE -ne 0) {
-    Write-Host "CTest run was unsuccessful. Exit code: $LASTEXITCODE."
-    exit $LASTEXITCODE
-}
-
-cd ..
-
-# Run pytest
-pytest -v -c pytest-windows.ini
+# Run pytest with the Windows config; the coverage invocation below is disabled.
+# pytest -n auto --cov=./ --cov-report=xml
+pytest -v --full-trace -c pytest-windows.ini
 if ($LASTEXITCODE -ne 0) {
     Write-Host "Pytest invocation was unsuccessful. Exit code: $LASTEXITCODE."
     exit $LASTEXITCODE
-}
\ No newline at end of file
+}
diff --git a/.github/workflows/_unittest.yml b/.github/workflows/_unittest.yml
index 587f2cf5e5a..a619b33dd2e 100644
--- a/.github/workflows/_unittest.yml
+++ b/.github/workflows/_unittest.yml
@@ -69,15 +69,7 @@ jobs:
           \$ErrorActionPreference = 'Stop'
           \$PSNativeCommandUseErrorActionPreference = \$true
 
-          .ci/scripts/setup-windows.ps1 -editable "${{ inputs.editable }}"
-          if (\$LASTEXITCODE -ne 0) {
-              Write-Host "Setup failed. Exit code: \$LASTEXITCODE."
-              exit \$LASTEXITCODE
-          }
+          .ci/scripts/setup-windows.ps1       
 
-          .ci/scripts/unittest-windows.ps1 -buildMode "${{ inputs.build-mode }}"
-          if (\$LASTEXITCODE -ne 0) {
-              Write-Host "Unit tests failed. Exit code: \$LASTEXITCODE."
-              exit \$LASTEXITCODE
-          }
+          powershell .ci/scripts/unittest-windows.ps1 -editable "${{ inputs.editable }}"
         }"
diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
index 975a8ebbb30..f5c5161e0cc 100644
--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@@ -1032,5 +1032,5 @@ jobs:
 
           .ci/scripts/setup-windows.ps1
 
-          .ci/scripts/test_model.ps1 -modelName ${{ matrix.model }} -backend ${{ matrix.backend }}
+          powershell .ci/scripts/test_model.ps1 -modelName ${{ matrix.model }} -backend ${{ matrix.backend }}
         }"
diff --git a/CMakeLists.txt b/CMakeLists.txt
index fc427d517a9..2664b4491c9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -143,13 +143,9 @@ endif()
 
 # -ffunction-sections -fdata-sections: breaks function and data into sections so
 # they can be properly gc'd. -s: strip symbol.
-if(WIN32)
-  set(CMAKE_CXX_FLAGS_RELEASE "/Gy /Gw ${CMAKE_CXX_FLAGS_RELEASE}")
-else()
-  set(CMAKE_CXX_FLAGS_RELEASE
-      "-ffunction-sections -fdata-sections ${CMAKE_CXX_FLAGS_RELEASE}"
-  )
-endif()
+set(CMAKE_CXX_FLAGS_RELEASE
+    "-ffunction-sections -fdata-sections ${CMAKE_CXX_FLAGS_RELEASE}"
+)
 if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
   set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -s")
 endif()
diff --git a/backends/xnnpack/CMakeLists.txt b/backends/xnnpack/CMakeLists.txt
index 33bf84b9066..200d8987b19 100644
--- a/backends/xnnpack/CMakeLists.txt
+++ b/backends/xnnpack/CMakeLists.txt
@@ -59,7 +59,7 @@ foreach(fbs_file ${_xnnpack_schema__srcs})
   )
 endforeach()
 
-if(CMAKE_HOST_SYSTEM_NAME STREQUAL "Windows")
+if(WIN32 AND NOT CMAKE_CROSSCOMPILING)
   set(MV_COMMAND
       powershell -Command
       "Move-Item -Path ${_xnnpack_flatbuffer__outputs} -Destination ${_xnnpack_schema__outputs} -Force"
diff --git a/export/target_recipes.py b/export/target_recipes.py
index 0a5ae9ce754..76e0cacc7b4 100644
--- a/export/target_recipes.py
+++ b/export/target_recipes.py
@@ -11,32 +11,26 @@
 selection and combine multiple backends optimally for target hardware.
 """
 
-import sys
 from typing import Dict, List
 
-if sys.platform != "win32":
-    import coremltools as ct
-    from executorch.backends.apple.coreml.recipes import CoreMLRecipeType
+import coremltools as ct
 
 # pyre-ignore
+from executorch.backends.apple.coreml.recipes import CoreMLRecipeType
 from executorch.backends.xnnpack.recipes import XNNPackRecipeType
 from executorch.export.recipe import ExportRecipe, RecipeType
 
 
 ## IOS Target configs
 # The following list of recipes are not exhaustive for CoreML; refer to CoreMLRecipeType for more detailed recipes.
-IOS_CONFIGS: Dict[str, List[RecipeType]] = (
-    {
-        # pyre-ignore
-        "ios-arm64-coreml-fp32": [CoreMLRecipeType.FP32, XNNPackRecipeType.FP32],
-        # pyre-ignore
-        "ios-arm64-coreml-fp16": [CoreMLRecipeType.FP16],
-        # pyre-ignore
-        "ios-arm64-coreml-int8": [CoreMLRecipeType.PT2E_INT8_STATIC],
-    }
-    if sys.platform != "win32"
-    else {}
-)
+IOS_CONFIGS: Dict[str, List[RecipeType]] = {
+    # pyre-ignore
+    "ios-arm64-coreml-fp32": [CoreMLRecipeType.FP32, XNNPackRecipeType.FP32],
+    # pyre-ignore
+    "ios-arm64-coreml-fp16": [CoreMLRecipeType.FP16],
+    # pyre-ignore
+    "ios-arm64-coreml-int8": [CoreMLRecipeType.PT2E_INT8_STATIC],
+}
 
 
 def _create_target_recipe(
diff --git a/export/tests/test_target_recipes.py b/export/tests/test_target_recipes.py
index 7a2a7c87342..d781ffea945 100644
--- a/export/tests/test_target_recipes.py
+++ b/export/tests/test_target_recipes.py
@@ -7,10 +7,10 @@
 # pyre-strict
 
 import logging
-import sys
 import unittest
 
 import torch
+from executorch.backends.apple.coreml.recipes import CoreMLRecipeProvider  # pyre-ignore
 from executorch.backends.xnnpack.recipes.xnnpack_recipe_provider import (
     XNNPACKRecipeProvider,
 )
@@ -18,11 +18,6 @@
 from executorch.export.target_recipes import get_ios_recipe
 from executorch.runtime import Runtime
 
-if sys.platform != "win32":
-    from executorch.backends.apple.coreml.recipes import (  # pyre-ignore
-        CoreMLRecipeProvider,
-    )
-
 
 class TestTargetRecipes(unittest.TestCase):
     """Test target recipes."""
@@ -31,14 +26,12 @@ def setUp(self) -> None:
         torch._dynamo.reset()
         super().setUp()
         recipe_registry.register_backend_recipe_provider(XNNPACKRecipeProvider())
-        if sys.platform != "win32":
-            # pyre-ignore
-            recipe_registry.register_backend_recipe_provider(CoreMLRecipeProvider())
+        # pyre-ignore
+        recipe_registry.register_backend_recipe_provider(CoreMLRecipeProvider())
 
     def tearDown(self) -> None:
         super().tearDown()
 
-    @unittest.skipIf(sys.platform == "win32", "Core ML is not available on Windows.")
     def test_ios_fp32_recipe_with_xnnpack_fallback(self) -> None:
         # Linear ops skipped by coreml but handled by xnnpack
         class Model(torch.nn.Module):
@@ -114,7 +107,6 @@ def forward(self, x, y):
             et_output = session.run_method("forward", example_inputs[0])
             logging.info(f"et output {et_output}")
 
-    @unittest.skipIf(sys.platform == "win32", "Core ML is not available on Windows.")
     def test_ios_quant_recipes(self) -> None:
         class Model(torch.nn.Module):
             def __init__(self):
diff --git a/extension/evalue_util/test/print_evalue_test.cpp b/extension/evalue_util/test/print_evalue_test.cpp
index 242cb0af224..b881e55d8a8 100644
--- a/extension/evalue_util/test/print_evalue_test.cpp
+++ b/extension/evalue_util/test/print_evalue_test.cpp
@@ -267,7 +267,7 @@ TEST(PrintEvalueTest, UnelidedBoolLists) {
   // case; the other scalar types use the same underlying code, so they don't
   // need to test this again.
   {
-    EValue value(ArrayRef<bool>(list.data(), static_cast<size_t>(0ul)));
+    EValue value(ArrayRef<bool>(list.data(), 0ul));
     expect_output(value, "(len=0)[]");
   }
   {
@@ -419,7 +419,7 @@ TEST(PrintEvalueTest, UnelidedDoubleLists) {
   std::array<double, 6> list = {-2.2, -1, 0, INFINITY, NAN, 3.3};
 
   {
-    EValue value(ArrayRef<double>(list.data(), static_cast<size_t>(0ul)));
+    EValue value(ArrayRef<double>(list.data(), 0ul));
     expect_output(value, "(len=0)[]");
   }
   {
diff --git a/extension/flat_tensor/test/CMakeLists.txt b/extension/flat_tensor/test/CMakeLists.txt
index fd3d6792f90..c3296dc61f3 100644
--- a/extension/flat_tensor/test/CMakeLists.txt
+++ b/extension/flat_tensor/test/CMakeLists.txt
@@ -23,7 +23,7 @@ add_custom_command(
          "${CMAKE_CURRENT_BINARY_DIR}/ModuleAddMulProgram.ptd"
   COMMAND
     ${PYTHON_EXECUTABLE} -m test.models.export_program --modules "ModuleAddMul"
-    --external-constants --outdir "${CMAKE_CURRENT_BINARY_DIR}"
+    --external-constants --outdir "${CMAKE_CURRENT_BINARY_DIR}" 2> /dev/null
   WORKING_DIRECTORY ${EXECUTORCH_ROOT}
 )
 
diff --git a/extension/module/test/CMakeLists.txt b/extension/module/test/CMakeLists.txt
index 1c4358dd73e..964b810eed5 100644
--- a/extension/module/test/CMakeLists.txt
+++ b/extension/module/test/CMakeLists.txt
@@ -24,10 +24,10 @@ add_custom_command(
          "${CMAKE_CURRENT_BINARY_DIR}/ModuleAddMulProgram.pte"
          "${CMAKE_CURRENT_BINARY_DIR}/ModuleAddMulProgram.ptd"
   COMMAND ${PYTHON_EXECUTABLE} -m test.models.export_program --modules
-          "ModuleAdd" --outdir "${CMAKE_CURRENT_BINARY_DIR}"
+          "ModuleAdd" --outdir "${CMAKE_CURRENT_BINARY_DIR}" 2> /dev/null
   COMMAND
     ${PYTHON_EXECUTABLE} -m test.models.export_program --modules "ModuleAddMul"
-    --external-constants --outdir "${CMAKE_CURRENT_BINARY_DIR}"
+    --external-constants --outdir "${CMAKE_CURRENT_BINARY_DIR}" 2> /dev/null
   WORKING_DIRECTORY ${EXECUTORCH_ROOT}
 )
 
diff --git a/extension/runner_util/test/CMakeLists.txt b/extension/runner_util/test/CMakeLists.txt
index 44b85a7fced..0cca06178cd 100644
--- a/extension/runner_util/test/CMakeLists.txt
+++ b/extension/runner_util/test/CMakeLists.txt
@@ -20,7 +20,7 @@ include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake)
 add_custom_command(
   OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/ModuleAdd.pte"
   COMMAND ${PYTHON_EXECUTABLE} -m test.models.export_program --modules
-          "ModuleAdd" --outdir "${CMAKE_CURRENT_BINARY_DIR}"
+          "ModuleAdd" --outdir "${CMAKE_CURRENT_BINARY_DIR}" 2> /dev/null
   WORKING_DIRECTORY ${EXECUTORCH_ROOT}
 )
 
diff --git a/extension/testing_util/temp_file.h b/extension/testing_util/temp_file.h
index 4edaf2135d8..aa8f5bcc82e 100644
--- a/extension/testing_util/temp_file.h
+++ b/extension/testing_util/temp_file.h
@@ -9,11 +9,13 @@
 #pragma once
 
 #include <array>
-#include <fstream>
 #include <memory>
 #include <string>
 
 #include <fcntl.h> // open()
+#include <stdio.h> // tmpnam(), remove()
+#include <unistd.h> // write(), close()
+
 #include <gtest/gtest.h>
 
 namespace executorch {
@@ -70,13 +72,19 @@ class TempFile {
     }
 
     // Write the contents to the file.
-    std::ofstream file(path, std::ios::out | std::ios::binary);
-    ASSERT_TRUE(file.is_open())
-        << "open(" << path << ") failed: " << strerror(errno);
-
-    file.write((const char*)data, size);
-    ASSERT_TRUE(file.good())
-        << "Failed to write " << size << " bytes: " << strerror(errno);
+    int fd = open(
+        path.c_str(),
+        // O_EXCL ensures that we are the ones creating this file, to help
+        // protect against race conditions.
+        O_CREAT | O_EXCL | O_RDWR,
+        // User can read and write, group can read.
+        S_IRUSR | S_IWUSR | S_IRGRP);
+    ASSERT_GE(fd, 0) << "open(" << path << ") failed: " << strerror(errno);
+
+    ssize_t nwrite = write(fd, data, size);
+    ASSERT_EQ(nwrite, size) << "Failed to write " << size << " bytes (wrote "
+                            << nwrite << "): " << strerror(errno);
+    close(fd);
 
     *out_path = path;
   }
diff --git a/kernels/portable/cpu/op_argmax.cpp b/kernels/portable/cpu/op_argmax.cpp
index a48c152133b..e9a561366f7 100644
--- a/kernels/portable/cpu/op_argmax.cpp
+++ b/kernels/portable/cpu/op_argmax.cpp
@@ -49,13 +49,13 @@ Tensor& argmax_out(
   static constexpr const char op_name[] = "argmax.out";
 
   ET_SWITCH_REALHBF16_TYPES(in.scalar_type(), ctx, op_name, CTYPE, [&] {
-    int64_t* out_data = out.mutable_data_ptr<int64_t>();
+    long* out_data = out.mutable_data_ptr<long>();
 
     const bool success = parallel_for_each_reduce_over_dim_output_index(
         in, dim, out, [&](const auto begin, const auto end) {
           for (const auto out_ix : c10::irange(begin, end)) {
-            std::tuple<CTYPE, int64_t> acc = reduce_over_dim<CTYPE>(
-                [](CTYPE v, int64_t ix, CTYPE acc_val, int64_t acc_ix) {
+            std::tuple<CTYPE, long> acc = reduce_over_dim<CTYPE>(
+                [](CTYPE v, long ix, CTYPE acc_val, long acc_ix) {
                   // the below condition as written is equivalent to
                   // !isnan(accval) && (isnan(v) || v > acc_val). See
                   // argument in op_argmin.cpp.
@@ -63,7 +63,7 @@ Tensor& argmax_out(
                     acc_val = v;
                     acc_ix = ix;
                   }
-                  return std::tuple<CTYPE, int64_t>{acc_val, acc_ix};
+                  return std::tuple<CTYPE, long>{acc_val, acc_ix};
                 },
                 in,
                 dim,
diff --git a/kernels/portable/cpu/op_argmin.cpp b/kernels/portable/cpu/op_argmin.cpp
index 55f2f82b04b..fda9463c5ee 100644
--- a/kernels/portable/cpu/op_argmin.cpp
+++ b/kernels/portable/cpu/op_argmin.cpp
@@ -49,13 +49,13 @@ Tensor& argmin_out(
   static constexpr const char op_name[] = "argmin.out";
 
   ET_SWITCH_REALHBF16_TYPES(in.scalar_type(), ctx, op_name, CTYPE, [&] {
-    int64_t* out_data = out.mutable_data_ptr<int64_t>();
+    long* out_data = out.mutable_data_ptr<long>();
 
     const bool success = parallel_for_each_reduce_over_dim_output_index(
         in, dim, out, [&](const auto begin, const auto end) {
           for (const auto out_ix : c10::irange(begin, end)) {
-            std::tuple<CTYPE, int64_t> acc = reduce_over_dim<CTYPE>(
-                [](CTYPE v, int64_t ix, CTYPE acc_val, int64_t acc_ix) {
+            std::tuple<CTYPE, long> acc = reduce_over_dim<CTYPE>(
+                [](CTYPE v, long ix, CTYPE acc_val, long acc_ix) {
                   // the below condition as written is equivalent to
                   // !isnan(accval) && (isnan(v) || v < acc_val). cases:
                   // - if neither acc_val nor v is NaN, !(v >= acc_val) is
@@ -70,7 +70,7 @@ Tensor& argmin_out(
                     acc_val = v;
                     acc_ix = ix;
                   }
-                  return std::tuple<CTYPE, int64_t>{acc_val, acc_ix};
+                  return std::tuple<CTYPE, long>{acc_val, acc_ix};
                 },
                 in,
                 dim,
diff --git a/kernels/portable/cpu/op_clamp.cpp b/kernels/portable/cpu/op_clamp.cpp
index 8ac78fd5477..b3aa41cda85 100644
--- a/kernels/portable/cpu/op_clamp.cpp
+++ b/kernels/portable/cpu/op_clamp.cpp
@@ -45,9 +45,9 @@ ET_NODISCARD bool check_bounds(
   static constexpr const char op_name[] = "clamp.out";
 
   if (isIntegralType(out_type, /*includeBool=*/false)) {
-    const int64_t val_long = utils::scalar_to<int64_t>(val_scalar);
+    const long val_long = utils::scalar_to<long>(val_scalar);
     ET_SWITCH_INT_TYPES(out_type, ctx, op_name, CTYPE_OUT, [&]() {
-      if (is_out_of_bounds<CTYPE_OUT, int64_t>(val_long)) {
+      if (is_out_of_bounds<CTYPE_OUT, long>(val_long)) {
         ET_LOG(Error, "%s value out of bounds", val_name);
         is_valid = false;
       }
diff --git a/kernels/portable/cpu/op_gather.cpp b/kernels/portable/cpu/op_gather.cpp
index 02ea502ca63..9899c21a94e 100644
--- a/kernels/portable/cpu/op_gather.cpp
+++ b/kernels/portable/cpu/op_gather.cpp
@@ -30,7 +30,7 @@ void gather_helper(
     Tensor& out,
     int64_t dim) {
   const CTYPE* in_data = in.const_data_ptr<CTYPE>();
-  const int64_t* index_data = index.const_data_ptr<int64_t>();
+  const long* index_data = index.const_data_ptr<long>();
   CTYPE* out_data = out.mutable_data_ptr<CTYPE>();
 
   if (index.dim() == 0) {
diff --git a/kernels/portable/cpu/op_max.cpp b/kernels/portable/cpu/op_max.cpp
index 467c8ccffd5..7df93470d39 100644
--- a/kernels/portable/cpu/op_max.cpp
+++ b/kernels/portable/cpu/op_max.cpp
@@ -82,19 +82,19 @@ std::tuple<Tensor&, Tensor&> max_out(
   ET_SWITCH_REALHBBF16_TYPES(
       in.scalar_type(), ctx, "max.dim_max", CTYPE, [&]() {
         CTYPE* max_data = max.mutable_data_ptr<CTYPE>();
-        int64_t* max_indices_data = max_indices.mutable_data_ptr<int64_t>();
+        long* max_indices_data = max_indices.mutable_data_ptr<long>();
 
         const bool success = parallel_for_each_reduce_over_dim_output_index(
             in, dim, max, [&](const auto begin, const auto end) {
               for (const auto out_ix : c10::irange(begin, end)) {
-                std::tuple<CTYPE, int64_t> acc = reduce_over_dim<CTYPE>(
-                    [](CTYPE v, int64_t ix, CTYPE acc_val, int64_t acc_ix) {
+                std::tuple<CTYPE, long> acc = reduce_over_dim<CTYPE>(
+                    [](CTYPE v, long ix, CTYPE acc_val, long acc_ix) {
                       if (!utils::isnan_override(acc_val) &&
                           (utils::isnan_override(v) || v > acc_val)) {
                         acc_val = v;
                         acc_ix = ix;
                       }
-                      return std::tuple<CTYPE, int64_t>{acc_val, acc_ix};
+                      return std::tuple<CTYPE, long>{acc_val, acc_ix};
                     },
                     in,
                     dim,
diff --git a/kernels/portable/cpu/op_min.cpp b/kernels/portable/cpu/op_min.cpp
index 304321bb9f8..a4cd1be2067 100644
--- a/kernels/portable/cpu/op_min.cpp
+++ b/kernels/portable/cpu/op_min.cpp
@@ -82,19 +82,19 @@ std::tuple<Tensor&, Tensor&> min_out(
   ET_SWITCH_REALHBBF16_TYPES(
       in.scalar_type(), ctx, "min.dim_min", CTYPE, [&]() {
         CTYPE* min_data = min.mutable_data_ptr<CTYPE>();
-        int64_t* min_indices_data = min_indices.mutable_data_ptr<int64_t>();
+        long* min_indices_data = min_indices.mutable_data_ptr<long>();
 
         const bool success = parallel_for_each_reduce_over_dim_output_index(
             in, dim, min, [&](const auto begin, const auto end) {
               for (const auto out_ix : c10::irange(begin, end)) {
-                std::tuple<CTYPE, int64_t> acc = reduce_over_dim<CTYPE>(
-                    [](CTYPE v, int64_t ix, CTYPE acc_val, int64_t acc_ix) {
+                std::tuple<CTYPE, long> acc = reduce_over_dim<CTYPE>(
+                    [](CTYPE v, long ix, CTYPE acc_val, long acc_ix) {
                       if (!utils::isnan_override(acc_val) &&
                           (utils::isnan_override(v) || v < acc_val)) {
                         acc_val = v;
                         acc_ix = ix;
                       }
-                      return std::tuple<CTYPE, int64_t>{acc_val, acc_ix};
+                      return std::tuple<CTYPE, long>{acc_val, acc_ix};
                     },
                     in,
                     dim,
diff --git a/kernels/portable/cpu/op_scatter.cpp b/kernels/portable/cpu/op_scatter.cpp
index 42d40c8284d..58341cefb1e 100644
--- a/kernels/portable/cpu/op_scatter.cpp
+++ b/kernels/portable/cpu/op_scatter.cpp
@@ -32,7 +32,7 @@ void scatter_src_helper(
     const Tensor& src,
     Tensor& out) {
   const CTYPE* in_data = in.const_data_ptr<CTYPE>();
-  const int64_t* index_data = index.const_data_ptr<int64_t>();
+  const long* index_data = index.const_data_ptr<long>();
   const CTYPE* src_data = src.const_data_ptr<CTYPE>();
   CTYPE* out_data = out.mutable_data_ptr<CTYPE>();
 
@@ -72,7 +72,7 @@ void scatter_value_helper(
     CTYPE_VAL val,
     Tensor& out) {
   const CTYPE* in_data = in.const_data_ptr<CTYPE>();
-  const int64_t* index_data = index.const_data_ptr<int64_t>();
+  const long* index_data = index.const_data_ptr<long>();
   CTYPE* out_data = out.mutable_data_ptr<CTYPE>();
 
   memcpy(out_data, in_data, in.nbytes());
diff --git a/kernels/portable/cpu/op_scatter_add.cpp b/kernels/portable/cpu/op_scatter_add.cpp
index 690c31342a9..f9c1f7677b6 100644
--- a/kernels/portable/cpu/op_scatter_add.cpp
+++ b/kernels/portable/cpu/op_scatter_add.cpp
@@ -23,7 +23,7 @@ namespace {
 template <typename CTYPE>
 void scatter_add_helper(
     const CTYPE* src_data,
-    const int64_t* index_data,
+    const long* index_data,
     CTYPE* out_data,
     const Tensor& src,
     const Tensor& index,
@@ -81,7 +81,7 @@ Tensor& scatter_add_out(
 
   ET_SWITCH_REALHBBF16_TYPES(self_type, ctx, "scatter_add.out", CTYPE, [&]() {
     const CTYPE* self_data = self.const_data_ptr<CTYPE>();
-    const int64_t* index_data = index.const_data_ptr<int64_t>();
+    const long* index_data = index.const_data_ptr<long>();
     const CTYPE* src_data = src.const_data_ptr<CTYPE>();
     CTYPE* out_data = out.mutable_data_ptr<CTYPE>();
 
diff --git a/kernels/portable/cpu/op_topk.cpp b/kernels/portable/cpu/op_topk.cpp
index bdea02f83bc..e2143ce78d5 100644
--- a/kernels/portable/cpu/op_topk.cpp
+++ b/kernels/portable/cpu/op_topk.cpp
@@ -79,7 +79,7 @@ void perform_topk(
     elem_t* queue) {
   const CTYPE* const in_data = in.const_data_ptr<CTYPE>();
   CTYPE* values_data = values.mutable_data_ptr<CTYPE>();
-  int64_t* indices_data = indices.mutable_data_ptr<int64_t>();
+  long* indices_data = indices.mutable_data_ptr<long>();
 
   if (in.dim() == 0) {
     values_data[0] = in_data[0];
diff --git a/kernels/test/CMakeLists.txt b/kernels/test/CMakeLists.txt
index 2e488b109c1..0304d751455 100644
--- a/kernels/test/CMakeLists.txt
+++ b/kernels/test/CMakeLists.txt
@@ -26,8 +26,8 @@ foreach(kernel ${_kernels})
   set(_functions_include "#include <executorch/kernels/${kernel}/Functions.h>")
   add_custom_command(
     OUTPUT "${_wrapper_path}"
-    COMMAND ${CMAKE_COMMAND} -E make_directory ${_wrapper_dir}
-    COMMAND ${CMAKE_COMMAND} -E echo ${_functions_include} > "${_wrapper_path}"
+    COMMAND mkdir -p ${_wrapper_dir}
+    COMMAND echo ${_functions_include} > "${_wrapper_path}"
     DEPENDS
       "${CMAKE_CURRENT_BINARY_DIR}/include/${kernel}/executorch/kernels/${kernel}/Functions.h"
       "${CMAKE_CURRENT_BINARY_DIR}/include/${kernel}/executorch/kernels/${kernel}/NativeFunctions.h"
@@ -44,7 +44,7 @@ foreach(kernel ${_kernels})
   add_custom_command(
     OUTPUT "${_wrapper_dir}/supported_features.cpp"
            "${_wrapper_dir}/supported_features.h"
-    COMMAND ${CMAKE_COMMAND} -E make_directory ${_wrapper_dir}
+    COMMAND mkdir -p ${_wrapper_dir}
     COMMAND
       ${PYTHON_EXECUTABLE} kernels/test/gen_supported_features.py
       kernels/${_supported_features_kernel}/test/supported_features_def.yaml >
@@ -73,35 +73,17 @@ foreach(kernel ${_kernels})
         "${CMAKE_CURRENT_BINARY_DIR}/../../kernels/${kernel}/${kernel}_ops_lib"
     )
   endif()
-
-  # Copy with glob needs to be handle in a platform-specific manner.
-  if(CMAKE_HOST_SYSTEM_NAME STREQUAL "Windows")
-    # The quoting here is complicated, because there are three levels of
-    # interpretation: CMake -> Batch -> Powershell. The invoked (batch) command
-    # should look like `powershell -Command "Copy-Item ... -Path \"...\" ...".
-    # Powershell sees `Copy-Item -Path "..." ...`.
-    set(_copy_headers_cmd
-        powershell
-        -Command
-        "Copy-Item -Path \\\"${_kernel_ops_lib_path}/*.h\\\" -Destination \\\"${CMAKE_CURRENT_BINARY_DIR}/include/${kernel}/executorch/kernels/${kernel}/\\\""
-    )
-  else()
-    set(_copy_headers_cmd
-        cp
-        "${_kernel_ops_lib_path}/*.h"
-        "${CMAKE_CURRENT_BINARY_DIR}/include/${kernel}/executorch/kernels/${kernel}/"
-    )
-  endif()
-
   add_custom_command(
     OUTPUT
       "${CMAKE_CURRENT_BINARY_DIR}/include/${kernel}/executorch/kernels/${kernel}/Functions.h"
       "${CMAKE_CURRENT_BINARY_DIR}/include/${kernel}/executorch/kernels/${kernel}/NativeFunctions.h"
       "${CMAKE_CURRENT_BINARY_DIR}/include/${kernel}/executorch/kernels/${kernel}/RegisterKernels.h"
     COMMAND
-      ${CMAKE_COMMAND} -E make_directory
+      mkdir -p
+      "${CMAKE_CURRENT_BINARY_DIR}/include/${kernel}/executorch/kernels/${kernel}/"
+    COMMAND
+      cp "${_kernel_ops_lib_path}/*.h"
       "${CMAKE_CURRENT_BINARY_DIR}/include/${kernel}/executorch/kernels/${kernel}/"
-    COMMAND ${_copy_headers_cmd}
     DEPENDS ${_kernel_ops_lib}
   )
 endforeach()
diff --git a/kernels/test/export_test_model.ps1 b/kernels/test/export_test_model.ps1
deleted file mode 100644
index d19e2a713d9..00000000000
--- a/kernels/test/export_test_model.ps1
+++ /dev/null
@@ -1,24 +0,0 @@
-param (
-    [string]$Modules,
-    [string]$OutDir,
-    [string]$CondaEnv
-)
-
-Set-PSDebug -Trace 1
-
-# Activate the VS dev environment - needed for dynamo. Try to use vswhere to locate the install. If not,
-# fall back to a reasonable guess for the build tools, which also happens to match the CLI setup.
-$vswherePath = "${env:ProgramFiles(x86)}\Microsoft Visual Studio\Installer\vswhere.exe"
-if (Test-Path $vswherePath) {
-    $vsInstallPath = & $vswherePath -latest -property installationPath
-} else {
-    $vsInstallPath = "C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\"
-}
-
-& "$vsInstallPath\Common7\Tools\Launch-VsDevShell.ps1" -Arch amd64 -SkipAutomaticLocation
-
-conda activate $CondaEnv
-
-$Modules = $Modules.Replace(" ", ",")
-echo "Modules: $Modules"
-python -m test.models.export_program --modules "$Modules" --outdir "$OutDir"
diff --git a/pytest-windows.ini b/pytest-windows.ini
index 0959318afdd..0eb30e3583d 100644
--- a/pytest-windows.ini
+++ b/pytest-windows.ini
@@ -100,7 +100,6 @@ addopts =
     #extension/llm/export
     --deselect=extension/pybindings/test/test_pybindings.py::PybindingsTest::test_method_quantized_ops
     --deselect=extension/pybindings/test/test_pybindings.py::PybindingsTest::test_quantized_ops
-    --deselect=extension/pybindings/test/test_pybindings.py::PybindingsTest::test_program_data_separation
     --deselect=runtime/test/test_runtime.py::RuntimeTest::test_load_program_with_path
     --deselect=exir/backend/test/test_compatibility.py::TestCompatibility::test_compatibility_in_runtime
     --deselect=exir/backend/test/test_compatibility.py::TestCompatibility::test_compatibility_in_runtime_edge_program_manager
@@ -109,7 +108,6 @@ addopts =
     --deselect=extension/llm/custom_ops/test_sdpa_with_kv_cache.py::SDPATestForSpeculativeDecode::test_sdpa_with_cache_seq_len_130
     --deselect=devtools/inspector/tests/inspector_test.py::TestInspector::test_etrecord_populates_correct_edge_dialect_aot_intermediate_outputs
     --deselect=devtools/inspector/tests/inspector_test.py::TestInspector::test_etrecord_populates_correct_export_program_aot_intermediate_outputs
-    --deselect=runtime/test/test_runtime_etdump_gen.py::RuntimeETDumpGenTest::test_etdump_generation
 
 # run the same tests multiple times to determine their
 # flakiness status. Default to 50 re-runs
diff --git a/runtime/core/exec_aten/testing_util/tensor_util.cpp b/runtime/core/exec_aten/testing_util/tensor_util.cpp
index 058f268ba68..35ddbe8ac15 100644
--- a/runtime/core/exec_aten/testing_util/tensor_util.cpp
+++ b/runtime/core/exec_aten/testing_util/tensor_util.cpp
@@ -28,43 +28,6 @@ namespace testing {
 
 namespace {
 
-/**
- * Returns true if the two elements are close according to the description on
- * `tensors_are_close()`.
- *
- * T must be a floating point type. Non-floating point data should be compared
- * directly.
- */
-template <typename T>
-bool element_is_close(const T a, const T b, double rtol, double atol) {
-  if constexpr (is_reduced_floating_point_v<T>) {
-    // MSVC complains about ambiguous overloads, so explicitly cast to float to
-    // compare.
-    return element_is_close(
-        static_cast<float>(a), static_cast<float>(b), rtol, atol);
-  } else {
-    if (std::isnan(a) && std::isnan(b)) {
-      // NaN == NaN
-    } else if (!std::isfinite(a) && !std::isfinite(b) && ((a > 0) == (b > 0))) {
-      // -Inf == -Inf
-      // +Inf == +Inf
-    } else if (rtol == 0 && atol == 0) {
-      // Exact comparison; avoid unnecessary math.
-      if (a != b) {
-        return false;
-      }
-    } else {
-      auto allowed_error = atol + std::abs(rtol * b);
-      auto actual_error = std::abs(a - b);
-      if (!std::isfinite(actual_error) || actual_error > allowed_error) {
-        return false;
-      }
-    }
-
-    return true;
-  }
-}
-
 /**
  * Returns true if the two arrays are close according to the description on
  * `tensors_are_close()`.
@@ -92,8 +55,23 @@ bool data_is_close(
     const auto ai = a[i];
     const auto bi = b[i];
 
-    if (!element_is_close(ai, bi, rtol, atol)) {
-      return false;
+    if (std::isnan(ai) && std::isnan(bi)) {
+      // NaN == NaN
+    } else if (
+        !std::isfinite(ai) && !std::isfinite(bi) && ((ai > 0) == (bi > 0))) {
+      // -Inf == -Inf
+      // +Inf == +Inf
+    } else if (rtol == 0 && atol == 0) {
+      // Exact comparison; avoid unnecessary math.
+      if (ai != bi) {
+        return false;
+      }
+    } else {
+      auto allowed_error = atol + std::abs(rtol * bi);
+      auto actual_error = std::abs(ai - bi);
+      if (!std::isfinite(actual_error) || actual_error > allowed_error) {
+        return false;
+      }
     }
   }
   return true;
diff --git a/runtime/executor/test/CMakeLists.txt b/runtime/executor/test/CMakeLists.txt
index 05d149ab1b4..d8df1f9ea56 100644
--- a/runtime/executor/test/CMakeLists.txt
+++ b/runtime/executor/test/CMakeLists.txt
@@ -17,31 +17,6 @@ set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
 
 include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake)
 
-if(CMAKE_HOST_SYSTEM_NAME STREQUAL "Windows")
-  # Use a wrapper script to set up the environment for MSVC to make Dynamo
-  # export work.
-  set(_export_program_cmd
-      powershell
-      ${EXECUTORCH_ROOT}/kernels/test/export_test_model.ps1
-      -Modules
-      "\"ModuleAdd,ModuleAddHalf,ModuleAddMul,ModuleDynamicCatUnallocatedIO,ModuleIndex,ModuleMultipleEntry,ModuleSimpleTrain,ModuleStateful\""
-      -outDir
-      "${CMAKE_CURRENT_BINARY_DIR}"
-      -CondaEnv
-      $ENV{CONDA_DEFAULT_ENV}
-  )
-else()
-  set(_export_program_cmd
-      ${PYTHON_EXECUTABLE}
-      -m
-      test.models.export_program
-      --modules
-      "ModuleAdd,ModuleAddHalf,ModuleAddMul,ModuleDynamicCatUnallocatedIO,ModuleIndex,ModuleMultipleEntry,ModuleSimpleTrain,ModuleStateful"
-      --outdir
-      "${CMAKE_CURRENT_BINARY_DIR}"
-  )
-endif()
-
 add_custom_command(
   OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/ModuleAdd.pte"
          "${CMAKE_CURRENT_BINARY_DIR}/ModuleAddHalf.pte"
@@ -54,14 +29,17 @@ add_custom_command(
          "${CMAKE_CURRENT_BINARY_DIR}/ModuleSimpleTrain.pte"
          "${CMAKE_CURRENT_BINARY_DIR}/ModuleStateful.pte"
          "${CMAKE_CURRENT_BINARY_DIR}/delegated/ModuleAddMul.pte"
-  COMMAND ${_export_program_cmd}
+  COMMAND
+    ${PYTHON_EXECUTABLE} -m test.models.export_program --modules
+    "ModuleAdd,ModuleAddHalf,ModuleAddMul,ModuleDynamicCatUnallocatedIO,ModuleIndex,ModuleMultipleEntry,ModuleSimpleTrain,ModuleStateful"
+    --outdir "${CMAKE_CURRENT_BINARY_DIR}"
   COMMAND
     ${PYTHON_EXECUTABLE} -m test.models.export_program --modules "ModuleAddMul"
     --external-constants --outdir "${CMAKE_CURRENT_BINARY_DIR}"
   COMMAND
     ${PYTHON_EXECUTABLE} -m test.models.export_delegated_program --modules
     "ModuleAddMul" --backend_id "StubBackend" --outdir
-    "${CMAKE_CURRENT_BINARY_DIR}/delegated/"
+    "${CMAKE_CURRENT_BINARY_DIR}/delegated/" || true
   WORKING_DIRECTORY ${EXECUTORCH_ROOT}
 )
 
diff --git a/runtime/kernel/test/CMakeLists.txt b/runtime/kernel/test/CMakeLists.txt
index a8166017e53..c70ec5d135b 100644
--- a/runtime/kernel/test/CMakeLists.txt
+++ b/runtime/kernel/test/CMakeLists.txt
@@ -39,6 +39,12 @@ add_test(kernel_runtime_context_test kernel_runtime_context_test)
 add_executable(
   operator_registry_max_kernel_num_test
   operator_registry_max_kernel_num_test.cpp
+  ../operator_registry.cpp
+  ../../core/evalue.cpp
+  ../../platform/abort.cpp
+  ../../platform/log.cpp
+  ../../platform/runtime.cpp
+  ../../platform/default/posix.cpp
 )
 target_link_libraries(
   operator_registry_max_kernel_num_test GTest::gtest GTest::gtest_main
diff --git a/test/end2end/exported_module.py b/test/end2end/exported_module.py
index 750b9097335..e5630b8e89f 100644
--- a/test/end2end/exported_module.py
+++ b/test/end2end/exported_module.py
@@ -187,6 +187,7 @@ def __init__(self, method):
                         if method_name_to_dynamic_shapes
                         else None
                     ),
+                    strict=True,
                 )
 
         exec_prog = to_edge(
diff --git a/test/models/export_delegated_program.py b/test/models/export_delegated_program.py
index 98f4b0b9b36..8f7c388d7ad 100644
--- a/test/models/export_delegated_program.py
+++ b/test/models/export_delegated_program.py
@@ -155,9 +155,9 @@ def forward(self, *args, **kwargs):
 
     if method_name != "forward":
         # Only require wrapper module if we're exporting a specific method other than forward.
-        exported_program = export(WrapperModule(eager_module), args=inputs)
+        exported_program = export(WrapperModule(eager_module), args=inputs, strict=True)
     else:
-        exported_program = export(eager_module, args=inputs)
+        exported_program = export(eager_module, args=inputs, strict=True)
 
     edge_config = EdgeCompileConfig(_check_ir_validity=False)
     et_config = exir.ExecutorchBackendConfig(
@@ -178,7 +178,7 @@ def forward(self, *args, **kwargs):
                 module=tagged_module,
                 gen_tag_fn=lambda x: module_class.__name__,
             )
-            exported_program = export(tagged_module, args=inputs)
+            exported_program = export(tagged_module, args=inputs, strict=True)
         executorch_program = to_edge_transform_and_lower(
             exported_program,
             compile_config=edge_config,
@@ -205,7 +205,7 @@ def forward(self, *args, **kwargs):
         composite_module(*inputs)
 
         executorch_program = to_edge(
-            export(composite_module, args=inputs)
+            export(composite_module, args=inputs, strict=True)
         ).to_executorch(config=et_config)
 
     return executorch_program
diff --git a/tools/cmake/Utils.cmake b/tools/cmake/Utils.cmake
index 77918ebbf2e..1e0671eb920 100644
--- a/tools/cmake/Utils.cmake
+++ b/tools/cmake/Utils.cmake
@@ -62,8 +62,6 @@ endfunction()
 function(target_link_options_gc_sections target_name)
   if(APPLE)
     target_link_options(${target_name} PRIVATE "LINKER:-dead_strip")
-  elseif(WIN32)
-    target_link_options(${target_name} PRIVATE "LINKER:/OPT:REF")
   else()
     target_link_options(${target_name} PRIVATE "LINKER:--gc-sections")
   endif()