pytorch · vanbasten23 · Aug 29, 2023 · Aug 29, 2023 · Aug 29, 2023 · Aug 29, 2023
diff --git a/.circleci/common.sh b/.circleci/common.sh
@@ -159,6 +159,9 @@ function run_torch_xla_tests() {
         # PJRT_DEVICE=GPU python test/test_train_mp_imagenet_fsdp.py --fake_data --use_nested_fsdp --use_small_fake_sample --num_epochs=1
         # PJRT_DEVICE=GPU python test/test_train_mp_imagenet_fsdp.py --fake_data --auto_wrap_policy type_based --use_small_fake_sample --num_epochs=1
         # XLA_DISABLE_FUNCTIONALIZATION=1 PJRT_DEVICE=GPU python test/test_train_mp_imagenet_fsdp.py --fake_data --use_nested_fsdp --use_small_fake_sample --num_epochs=1
+        PJRT_DEVICE=GPU python test/test_train_mp_imagenet_fsdp.py --fake_data --use_nested_fsdp --use_small_fake_sample --num_epochs=1
+        PJRT_DEVICE=GPU python test/test_train_mp_imagenet_fsdp.py --fake_data --auto_wrap_policy type_based --use_small_fake_sample --num_epochs=1
+        XLA_DISABLE_FUNCTIONALIZATION=1 PJRT_DEVICE=GPU python test/test_train_mp_imagenet_fsdp.py --fake_data --use_nested_fsdp --use_small_fake_sample --num_epochs=1
         # Syncfree SGD optimizer tests
         if [ -d ./torch_xla/amp/syncfree ]; then
           echo "Running Syncfree Optimizer Test"

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
@@ -43,7 +43,7 @@ jobs:
     needs: build
     with:
       docker-image: ${{ needs.build.outputs.docker-image }}
-      runner: linux.8xlarge.nvidia.gpu
+      runner: linux.g5.12xlarge.nvidia.gpu
       timeout-minutes: 240
       disable-xrt: 1
     secrets:
@@ -69,7 +69,7 @@ jobs:
     needs: build
     with:
       docker-image: ${{ needs.build.outputs.docker-image }}
-      runner: linux.8xlarge.nvidia.gpu
+      runner: linux.g5.12xlarge.nvidia.gpu
       timeout-minutes: 210
       collect-coverage: true
       disable-xrt: 1

diff --git a/.github/workflows/build_and_test_xrt.yml b/.github/workflows/build_and_test_xrt.yml
@@ -41,7 +41,7 @@ jobs:
     needs: build
     with:
       docker-image: ${{ needs.build.outputs.docker-image }}
-      runner: linux.8xlarge.nvidia.gpu
+      runner: linux.g5.12xlarge.nvidia.gpu
       timeout-minutes: 180
       disable-xrt: 0
     secrets:

diff --git a/WORKSPACE b/WORKSPACE
@@ -42,6 +42,7 @@ http_archive(
         "//openxla_patches:f16_abi_clang.diff",
         "//openxla_patches:gpu_race_condition.diff",
         "//openxla_patches:constexpr_return.diff",
+        "//openxla_patches:service.diff",
     ],
     strip_prefix = "xla-cd2cf5c34931e4fc1cacf83bfc480a5b93f05f6d",
     urls = [

diff --git a/openxla_patches/service.diff b/openxla_patches/service.diff
@@ -0,0 +1,14 @@
+Delete this patch once the OpenXLA pin is updated to a commit dated on or after 08/09.
+diff --git a/xla/service/layout_assignment.cc b/xla/service/layout_assignment.cc
+index f69224f42..43feda2a0 100644
+--- a/xla/service/layout_assignment.cc
++++ b/xla/service/layout_assignment.cc
+@@ -154,7 +154,7 @@ OperandLayoutConstraint::OperandLayoutConstraint(
+       instruction_(instruction),
+       operand_no_(operand_no) {
+   CHECK(shape_layout.LayoutIsSet());
+-  CHECK(ShapeUtil::CompatibleIgnoringElementType(
++  CHECK(ShapeUtil::CompatibleKind(
+       shape_layout.shape(), instruction->operand(operand_no)->shape()))
+       << shape_layout.shape() << " is not compatible with "
+       << instruction->operand(operand_no)->shape() << " (for operand "
diff --git a/test/run_tests.sh b/test/run_tests.sh
@@ -150,7 +150,6 @@ function run_xla_op_tests {
   run_test "$CDIR/test_metrics.py"
   run_test "$CDIR/test_zero1.py"
   run_test "$CDIR/dynamo/test_dynamo_integrations_util.py"
-  run_test "$CDIR/dynamo/test_dynamo.py"
   run_test "$CDIR/dynamo/test_bridge.py"
   run_test "$CDIR/dynamo/test_num_output.py"
   run_save_tensor_ir "$CDIR/dynamo/test_dynamo_graph_dump.py"

diff --git a/test/stablehlo/test_stablehlo_compile.py b/test/stablehlo/test_stablehlo_compile.py
@@ -28,6 +28,10 @@ def test_resnet18_stablehlo_compile(self):
     xla_resnet18.eval()
     xla_input = torch_input.to(device)
     xla_output = xla_resnet18(xla_input)
+    print('xw32 cpu_output=', cpu_output)
+    print('xw32 xla_output.cpu()=', xla_output.cpu())
+    self.assertTrue(
+        torch.allclose(cpu_output, xla_output.cpu(), rtol=1e-03, atol=1e-03))
     self.assertTrue(
         torch.allclose(cpu_output, xla_output.cpu(), rtol=1e-05, atol=1e-05))
     self.assertEqual(met.counter_value('StableHloCompile'), 1)