10 changes: 5 additions & 5 deletions benchmarks/README.md
@@ -77,7 +77,7 @@ Disable autoboost selecting clock rate based on thermal, and power budget effect
Run the `experiment_runner.py` from the `pytorch` directory, which should be the
parent of the `xla` directory.

-The following example runs the alexnet benchmark on GPU through the
+The following example runs the alexnet benchmark on CPU through the
PyTorch/XLA-dynamo path and through the Inductor-dynamo path with 5 repetitions each.
The results will be stored in a JSON file (e.g. results.jsonl) in `experiment_results`.

@@ -88,7 +88,7 @@ python xla/benchmarks/experiment_runner.py \
--xla=PJRT --xla=None \
--test=eval --test=train \
--suite-name=torchbench \
---accelerator=cuda \
+--accelerator=cpu \
--output-dirname=experiment_results \
--repeat=5 \
--print-subprocess \
Expand Down Expand Up @@ -118,7 +118,7 @@ python xla/benchmarks/experiment_runner.py \
--suite-name=torchbench \
--progress-bar \
--model-config='{"model_name":"BERT_pytorch"}' \
--experiment-config='{"accelerator":"cuda","xla":"PJRT","xla_flags":null,"dynamo":"openxla","torch_xla2":null,"test":"train","keep_model_data_on_cuda":false,"enable_functionalization":false}' \
--experiment-config='{"accelerator":"cpu","xla":"PJRT","xla_flags":null,"dynamo":"openxla","torch_xla2":null,"test":"train","enable_functionalization":false}' \
--repeat 1
```
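
For reference, the `--experiment-config` value is a single JSON object, and after this change it no longer carries a `keep_model_data_on_cuda` key. A minimal decoding sketch, reusing the exact JSON from the command above; the sketch itself is illustrative and not part of the runner:

```python
# Decode the --experiment-config value shown above. The JSON string is
# copied verbatim from the command; the surrounding code is only a sketch.
import json

raw = ('{"accelerator":"cpu","xla":"PJRT","xla_flags":null,'
       '"dynamo":"openxla","torch_xla2":null,"test":"train",'
       '"enable_functionalization":false}')
experiment_config = json.loads(raw)
assert "keep_model_data_on_cuda" not in experiment_config
print(experiment_config["accelerator"])  # -> cpu
```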

@@ -135,13 +135,13 @@ works only for inference now.

```
cd pytorch
-PJRT_DEVICE=CUDA python3 new_xla/benchmarks/experiment_runner.py \
+PJRT_DEVICE=CPU python3 new_xla/benchmarks/experiment_runner.py \
--xla=PJRT \
--dynamo=openxla \
--test=eval \
--filter=BERT_pytorch$ \
--suite-name=torchbench \
---accelerator=cuda \
+--accelerator=cpu \
--progress-bar \
--output-dirname=/tmp/output \
--repeat=2 \
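Each run above appends one JSON record per experiment to the results file. A small post-processing sketch, assuming the default location from the first example; the record keys accessed here (`model`, `experiment`) are illustrative assumptions rather than a documented schema:

```python
# Scan the runner's JSONL output. The record keys below ("model",
# "experiment") are assumptions for illustration; print one raw record
# first to confirm the actual schema.
import json

with open("experiment_results/results.jsonl") as f:
    for line in f:
        record = json.loads(line)
        print(record.get("model"), record.get("experiment"))
```
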
27 changes: 7 additions & 20 deletions benchmarks/benchmark_experiment.py
@@ -20,13 +20,12 @@ def list_experiment_configs(self):

# Start with default config.
config_choices = {
"accelerator": ["cpu", "cuda", "tpu"],
"accelerator": ["cpu", "tpu"],
"xla": [None, "PJRT", "XRT"],
"xla_flags": [None],
"dynamo": [None, "inductor", "openxla"],
"torch_xla2": [None], # options only apply to torch_xla2
"test": ["eval", "train"],
"keep_model_data_on_cuda": [False],
"enable_functionalization": [False],
}

@@ -46,10 +45,6 @@ def list_experiment_configs(self):
if self._args.xla_flags:
config_choices["xla_flags"] = list(
map(parse_none_str, set(self._args.xla_flags)))
-if self._args.keep_model_data_on_cuda:
-config_choices["keep_model_data_on_cuda"] = [
-self._args.keep_model_data_on_cuda
-]
if self._args.enable_functionalization:
config_choices["enable_functionalization"] = [
self._args.enable_functionalization
@@ -85,7 +80,6 @@ def _is_available(self,
cfg_xla = experiment_config["xla"]
cfg_test = experiment_config["test"]
cfg_torch_xla2 = experiment_config["torch_xla2"]
-cfg_keep_model_data_on_cuda = experiment_config["keep_model_data_on_cuda"]

# Check that dynamo refers to an existing backend.
if cfg_dynamo is not None and cfg_dynamo not in dynamo.list_backends(
@@ -118,16 +112,16 @@
if cfg_accelerator == "tpu":
if cfg_xla is None:
return False
-elif cfg_accelerator in ("cpu", "cuda"):
+elif cfg_accelerator == "cpu":
if cfg_xla == "XRT":
return False
+elif cfg_accelerator == "cuda":
+if cfg_xla is not None:
+# PyTorch/XLA with CUDA backend is no longer supported.
+return False
else:
raise NotImplementedError

-# cfg_keep_model_data_on_cuda is only avaible when using dynamo
-if cfg_keep_model_data_on_cuda and cfg_dynamo != "openxla":
-return False

return True

def load_experiment(self,
@@ -140,15 +134,13 @@ def load_experiment(self,
test = experiment_config["test"]
batch_size = experiment_config.get("batch_size", self._args.batch_size)
torch_xla2 = experiment_config["torch_xla2"]
-keep_model_data_on_cuda = experiment_config["keep_model_data_on_cuda"]
enable_functionalization = experiment_config["enable_functionalization"]
return BenchmarkExperiment(
accelerator=accelerator,
xla=xla,
xla_flags=xla_flags,
dynamo=dynamo,
torch_xla2=torch_xla2,
-keep_model_data_on_cuda=keep_model_data_on_cuda,
test=test,
batch_size=batch_size,
enable_functionalization=enable_functionalization,
@@ -159,14 +151,12 @@ class BenchmarkExperiment:

def __init__(self, accelerator: str, xla: Optional[str],
xla_flags: Optional[str], dynamo: str, torch_xla2: bool,
-keep_model_data_on_cuda: bool, test: str, batch_size: str,
-enable_functionalization: bool):
+test: str, batch_size: str, enable_functionalization: bool):
self.accelerator = accelerator
self.xla = xla
self.xla_flags = xla_flags
self.dynamo = dynamo
self.torch_xla2 = torch_xla2
-self.keep_model_data_on_cuda = keep_model_data_on_cuda
self.test = test
self.batch_size = batch_size
self.accelerator_model = get_accelerator_model(self.accelerator)
@@ -191,8 +181,6 @@ def update_process_env(self, process_env: Dict[str, str]):
if is_xla_device_available("TPU"):
process_env["TPU_NUM_DEVICES"] = "1"
process_env["XRT_TPU_CONFIG"] = "localservice;0;localhost:51011"
-elif is_xla_device_available("CUDA"):
-process_env["GPU_NUM_DEVICES"] = "1"
elif self.xla is None:
# In non-xla CPU training experiments, an env var is still needed if an
# xla device exists, or there will be "Missing XLA configuration" error.
@@ -246,7 +234,6 @@ def to_dict(self):
d["xla_flags"] = self.xla_flags
d["dynamo"] = self.dynamo
d["torch_xla2"] = self.torch_xla2
d["keep_model_data_on_cuda"] = self.keep_model_data_on_cuda
d["test"] = self.test
d["batch_size"] = self.batch_size
d["enable_functionalization"] = self.enable_functionalization
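For context on the file above: the runner enumerates the cross product of `config_choices` and drops combinations rejected by `_is_available`. A simplified, self-contained sketch of that pattern; the real `_is_available` also checks dynamo backends, `torch_xla2`, and CLI overrides:

```python
# Enumerate-then-filter sketch mirroring list_experiment_configs /
# _is_available after this change. Illustrative only.
import itertools

config_choices = {
    "accelerator": ["cpu", "tpu"],
    "xla": [None, "PJRT", "XRT"],
    "dynamo": [None, "inductor", "openxla"],
    "test": ["eval", "train"],
}

def is_available(cfg):
    if cfg["accelerator"] == "tpu" and cfg["xla"] is None:
        return False  # TPU runs require an XLA runtime.
    if cfg["accelerator"] == "cpu" and cfg["xla"] == "XRT":
        return False  # The legacy XRT runtime is rejected on CPU.
    return True

experiments = [
    cfg
    for cfg in (dict(zip(config_choices, values))
                for values in itertools.product(*config_choices.values()))
    if is_available(cfg)
]
print(len(experiments))  # 24 of the 36 raw combinations survive.
```
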
11 changes: 1 addition & 10 deletions benchmarks/benchmark_model.py
@@ -103,7 +103,6 @@ def prepare_for_experiment(
else:
raise NotImplementedError

-keep_model_data_on_cuda = self.benchmark_experiment.keep_model_data_on_cuda
if self.benchmark_experiment.torch_xla2:
import torch_xla2.export
import torch_xla2
@@ -125,7 +124,7 @@
self.module = lambda *x: jax_func(weights, x)
self.example_inputs = move_to_device(
self.example_inputs, device, torch_xla2=True)
-elif not keep_model_data_on_cuda:
+else:
self.module = self.module.to(self.device)
self.example_inputs = move_to_device(
self.example_inputs, self.device, torch_xla2=False)
@@ -137,14 +136,6 @@
logger.info(f"Running torch.compile with opts {compilation_opts}")
self.model_iter_fn = torch.compile(self.model_iter_fn, **compilation_opts)

-if keep_model_data_on_cuda:
-
-def assert_func(t):
-assert t.device.type.lower(
-) == 'cuda', 'When keep_model_data_on_cuda is set, the input data should remain on the CUDA device.'
-
-pytree.tree_map_only(torch.Tensor, assert_func, self.example_inputs)

def pick_grad(self):
if self.benchmark_experiment.test == "eval":
return torch.no_grad()
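With `keep_model_data_on_cuda` gone, the `else:` branch above always moves the module and example inputs to the benchmark device. A rough sketch of what a pytree-based `move_to_device` helper can look like; the benchmarks' real helper may differ in signature and options:

```python
# Hedged sketch of a pytree-based device-move helper; not the benchmarks'
# actual move_to_device implementation.
import torch
from torch.utils import _pytree as pytree

def move_to_device(inputs, device):
    # Map only over tensors, leaving non-tensor leaves untouched.
    return pytree.tree_map_only(torch.Tensor, lambda t: t.to(device), inputs)
```
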
5 changes: 0 additions & 5 deletions benchmarks/experiment_runner.py
@@ -936,11 +936,6 @@ def __str__(self):
help="""Collect CUDA and CPU times per operation. This will also gather
CPU fallbacks.""",
)
-parser.add_argument(
-"--keep-model-data-on-cuda",
-action="store_true",
-help="""Whether to keep the model and data on CUDA and not to move to an XLA device. This is to be used with PyTorch/XLA dynamo. When set, PyTorch/XLA dynamo bridge move the model and data to the XLA device.""",
-)
parser.add_argument(
"--xla-flags",
type=str,