diff --git a/benchmarks/README.md b/benchmarks/README.md index 9476ecbcb50..fe4bdc309b4 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -77,7 +77,7 @@ Disable autoboost selecting clock rate based on thermal, and power budget effect Run the `experiment_runner.py` from the `pytorch` directory, which should be the parent of the `xla` directory. -The following example runs the alexnet benchmark on GPU through the +The following example runs the alexnet benchmark on CPU through the Pytorch/XLA-dynamo path and through the Inductor-dynamo with 5 repetitions each. The results will be stored in a json file (eg results.jsonl) in `experiment_results`. @@ -88,7 +88,7 @@ python xla/benchmarks/experiment_runner.py \ --xla=PJRT --xla=None \ --test=eval --test=train \ --suite-name=torchbench \ - --accelerator=cuda \ + --accelerator=cpu \ --output-dirname=experiment_results \ --repeat=5 \ --print-subprocess \ @@ -118,7 +118,7 @@ python xla/benchmarks/experiment_runner.py \ --suite-name=torchbench \ --progress-bar \ --model-config='{"model_name":"BERT_pytorch"}' \ - --experiment-config='{"accelerator":"cuda","xla":"PJRT","xla_flags":null,"dynamo":"openxla","torch_xla2":null,"test":"train","keep_model_data_on_cuda":false,"enable_functionalization":false}' \ + --experiment-config='{"accelerator":"cpu","xla":"PJRT","xla_flags":null,"dynamo":"openxla","torch_xla2":null,"test":"train","enable_functionalization":false}' \ --repeat 1 ``` @@ -135,13 +135,13 @@ works only for inference now. ``` cd pytorch -PJRT_DEVICE=CUDA python3 new_xla/benchmarks/experiment_runner.py \ +PJRT_DEVICE=CPU python3 new_xla/benchmarks/experiment_runner.py \ --xla=PJRT \ --dynamo=openxla \ --test=eval \ --filter=BERT_pytorch$ \ --suite-name=torchbench \ - --accelerator=cuda \ + --accelerator=cpu \ --progress-bar \ --output-dirname=/tmp/output \ --repeat=2 \ diff --git a/benchmarks/benchmark_experiment.py b/benchmarks/benchmark_experiment.py index e1fab48334a..daffdce2f7f 100644 --- a/benchmarks/benchmark_experiment.py +++ b/benchmarks/benchmark_experiment.py @@ -20,13 +20,12 @@ def list_experiment_configs(self): # Start with default config. config_choices = { - "accelerator": ["cpu", "cuda", "tpu"], + "accelerator": ["cpu", "tpu"], "xla": [None, "PJRT", "XRT"], "xla_flags": [None], "dynamo": [None, "inductor", "openxla"], "torch_xla2": [None], # options only apply to torch_xla2 "test": ["eval", "train"], - "keep_model_data_on_cuda": [False], "enable_functionalization": [False], } @@ -46,10 +45,6 @@ def list_experiment_configs(self): if self._args.xla_flags: config_choices["xla_flags"] = list( map(parse_none_str, set(self._args.xla_flags))) - if self._args.keep_model_data_on_cuda: - config_choices["keep_model_data_on_cuda"] = [ - self._args.keep_model_data_on_cuda - ] if self._args.enable_functionalization: config_choices["enable_functionalization"] = [ self._args.enable_functionalization @@ -85,7 +80,6 @@ def _is_available(self, cfg_xla = experiment_config["xla"] cfg_test = experiment_config["test"] cfg_torch_xla2 = experiment_config["torch_xla2"] - cfg_keep_model_data_on_cuda = experiment_config["keep_model_data_on_cuda"] # Check that dynamo refers to an existing backend. 
if cfg_dynamo is not None and cfg_dynamo not in dynamo.list_backends( @@ -118,16 +112,16 @@ def _is_available(self, if cfg_accelerator == "tpu": if cfg_xla is None: return False - elif cfg_accelerator in ("cpu", "cuda"): + elif cfg_accelerator == "cpu": if cfg_xla == "XRT": return False + elif cfg_accelerator == "cuda": + if cfg_xla is not None: + # PyTorch/XLA with CUDA backend is no longer supported. + return False else: raise NotImplementedError - # cfg_keep_model_data_on_cuda is only avaible when using dynamo - if cfg_keep_model_data_on_cuda and cfg_dynamo != "openxla": - return False - return True def load_experiment(self, @@ -140,7 +134,6 @@ def load_experiment(self, test = experiment_config["test"] batch_size = experiment_config.get("batch_size", self._args.batch_size) torch_xla2 = experiment_config["torch_xla2"] - keep_model_data_on_cuda = experiment_config["keep_model_data_on_cuda"] enable_functionalization = experiment_config["enable_functionalization"] return BenchmarkExperiment( accelerator=accelerator, @@ -148,7 +141,6 @@ def load_experiment(self, xla_flags=xla_flags, dynamo=dynamo, torch_xla2=torch_xla2, - keep_model_data_on_cuda=keep_model_data_on_cuda, test=test, batch_size=batch_size, enable_functionalization=enable_functionalization, @@ -159,14 +151,12 @@ class BenchmarkExperiment: def __init__(self, accelerator: str, xla: Optional[str], xla_flags: Optional[str], dynamo: str, torch_xla2: bool, - keep_model_data_on_cuda: bool, test: str, batch_size: str, - enable_functionalization: bool): + test: str, batch_size: str, enable_functionalization: bool): self.accelerator = accelerator self.xla = xla self.xla_flags = xla_flags self.dynamo = dynamo self.torch_xla2 = torch_xla2 - self.keep_model_data_on_cuda = keep_model_data_on_cuda self.test = test self.batch_size = batch_size self.accelerator_model = get_accelerator_model(self.accelerator) @@ -191,8 +181,6 @@ def update_process_env(self, process_env: Dict[str, str]): if is_xla_device_available("TPU"): process_env["TPU_NUM_DEVICES"] = "1" process_env["XRT_TPU_CONFIG"] = "localservice;0;localhost:51011" - elif is_xla_device_available("CUDA"): - process_env["GPU_NUM_DEVICES"] = "1" elif self.xla is None: # In non-xla CPU training experiments, an env var is still needed if an # xla device exists, or there will be "Missing XLA configuration" error. 
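
For reference, the accelerator/XLA compatibility rules that `_is_available` enforces after this change reduce to roughly the following sketch (simplified: the real method also validates the dynamo backend, torch_xla2, and the test mode; the function name here is illustrative only):

```python
# Simplified sketch of the accelerator/XLA compatibility check after this
# change; the actual _is_available() performs additional validation.
from typing import Optional

def accelerator_xla_compatible(accelerator: str, xla: Optional[str]) -> bool:
  if accelerator == "tpu":
    # TPU experiments must go through an XLA runtime.
    return xla is not None
  if accelerator == "cpu":
    # XRT is not supported on CPU; PJRT and plain (non-XLA) runs are fine.
    return xla != "XRT"
  if accelerator == "cuda":
    # PyTorch/XLA with the CUDA backend is no longer supported, so only
    # plain (non-XLA) CUDA runs (e.g. Inductor) remain valid.
    return xla is None
  raise NotImplementedError(accelerator)
```
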
@@ -246,7 +234,6 @@ def to_dict(self): d["xla_flags"] = self.xla_flags d["dynamo"] = self.dynamo d["torch_xla2"] = self.torch_xla2 - d["keep_model_data_on_cuda"] = self.keep_model_data_on_cuda d["test"] = self.test d["batch_size"] = self.batch_size d["enable_functionalization"] = self.enable_functionalization diff --git a/benchmarks/benchmark_model.py b/benchmarks/benchmark_model.py index 2b2f6c1957b..008a4539c7a 100644 --- a/benchmarks/benchmark_model.py +++ b/benchmarks/benchmark_model.py @@ -103,7 +103,6 @@ def prepare_for_experiment( else: raise NotImplementedError - keep_model_data_on_cuda = self.benchmark_experiment.keep_model_data_on_cuda if self.benchmark_experiment.torch_xla2: import torch_xla2.export import torch_xla2 @@ -125,7 +124,7 @@ def prepare_for_experiment( self.module = lambda *x: jax_func(weights, x) self.example_inputs = move_to_device( self.example_inputs, device, torch_xla2=True) - elif not keep_model_data_on_cuda: + else: self.module = self.module.to(self.device) self.example_inputs = move_to_device( self.example_inputs, self.device, torch_xla2=False) @@ -137,14 +136,6 @@ def prepare_for_experiment( logger.info(f"Running torch.compile with opts {compilation_opts}") self.model_iter_fn = torch.compile(self.model_iter_fn, **compilation_opts) - if keep_model_data_on_cuda: - - def assert_func(t): - assert t.device.type.lower( - ) == 'cuda', 'When keep_model_data_on_cuda is set, the input data should remain on the CUDA device.' - - pytree.tree_map_only(torch.Tensor, assert_func, self.example_inputs) - def pick_grad(self): if self.benchmark_experiment.test == "eval": return torch.no_grad() diff --git a/benchmarks/experiment_runner.py b/benchmarks/experiment_runner.py index b784af68e47..04a5524ad38 100644 --- a/benchmarks/experiment_runner.py +++ b/benchmarks/experiment_runner.py @@ -936,11 +936,6 @@ def __str__(self): help="""Collect CUDA and CPU times per operation. This will also gather CPU fallbacks.""", ) - parser.add_argument( - "--keep-model-data-on-cuda", - action="store_true", - help="""Whether to keep the model and data on CUDA and not to move to an XLA device. This is to be used with PyTorch/XLA dynamo. When set, PyTorch/XLA dynamo bridge move the model and data to the XLA device.""", - ) parser.add_argument( "--xla-flags", type=str, diff --git a/benchmarks/llama.py b/benchmarks/llama.py deleted file mode 100644 index 53e88ddb0d2..00000000000 --- a/benchmarks/llama.py +++ /dev/null @@ -1,269 +0,0 @@ -import argparse -import datetime -import logging -import json -import os -import re -import subprocess -import sys - -from enum import Enum - -logger = logging.getLogger(__name__) - - -def get_info_from_result_file(results_dir: str) -> tuple[str, str, float]: - results_file = os.path.join(results_dir, 'results.jsonl') - if not os.path.exists(results_file): - sys.exit(f"Results file {results_file} not found. 
" - "Please run experiment_runner.py first.") - accelerator_model = None - with open(results_file, 'r') as f: - first_line = f.readline() - acc_match = re.search(r'"accelerator_model": "([^"]+)"', first_line) - time_match = re.search(r'"timestamp": ([0-9.]+)', first_line) - if acc_match and time_match: - accelerator_model = acc_match.group(1) - timestamp = float(time_match.group(1)) - else: - sys.exit(f"Cannot find a timestamp and a matching accelerator " - "in {results_file}.") - logger.debug(f"Found accelerator_model='{accelerator_model}' and " - f"timestamp={timestamp} in {results_file}.") - return accelerator_model, timestamp - - -def set_up_llama_repo(workspace_dir: str) -> str: - llama_dir = os.path.join(workspace_dir, 'llama-inference') - if os.path.exists(llama_dir): - logger.debug(f'llama_dir={llama_dir} already exists; no setting up to do.') - return llama_dir - - logger.debug(f'Setting up llama repo at {llama_dir}.') - subprocess.check_call([ - 'git', 'clone', 'https://github.com/pytorch-tpu/llama.git', '--branch', - 'llama2-google-next-inference', llama_dir - ]) - subprocess.check_call( - ['pip', 'install', '-r', - os.path.join(llama_dir, 'requirements.txt')]) - subprocess.check_call(['pip', 'install', '-e', llama_dir]) - - # Create model JSON files - model_configs = { - '7b.json': { - "dim": 4096, - "multiple_of": 256, - "n_heads": 32, - "n_layers": 32, - "norm_eps": 1e-05, - "vocab_size": -1 - }, - '13b.json': { - "dim": 5120, - "multiple_of": 256, - "n_heads": 40, - "n_layers": 40, - "norm_eps": 1e-05, - "vocab_size": -1 - }, - '70b.json': { - "dim": 8192, - "multiple_of": 4096, - "ffn_dim_multiplier": 1.3, - "n_heads": 64, - "n_kv_heads": 8, - "n_layers": 80, - "norm_eps": 1e-05, - "vocab_size": -1 - } - } - for filename, config in model_configs.items(): - filepath = os.path.join(llama_dir, filename) - with open(filepath, 'w') as f: - json.dump(config, f) - f.write("\n") - return llama_dir - - -def parse_log_file(log_file: str): - latencies = [] - with open(log_file, 'r') as f: - for line in f: - if ('Totally decoded ' not in line or 'tokens in' not in line or - ' seconds' not in line): - continue - parts = line.strip().split() - tokens = float(parts[2]) - seconds = float(parts[5]) - latency_per_token = seconds / tokens - latencies.append(latency_per_token) - logger.debug(f'{log_file}: Found latencies={latencies}') - return latencies - - -def benchmark_has_already_run(results_file: str, model_name: str, xla: str, - dynamo: str, batch_size: int): - with open(results_file, 'r') as f: - for line in f: - # Grep for relevant lines to avoid parsing the entire JSONL file. 
- if f'"model_name": "{model_name}"' not in line: - continue - r = json.loads(line.rstrip('\n|\r')) - # yapf: disable - if all( - r.get(k1, {}).get(k2) == v - for (k1, k2, v) in [ - ('experiment', 'accelerator', 'cuda'), - ('experiment', 'batch_size', batch_size), - ('experiment', 'dynamo', dynamo), - ('experiment', 'test', 'eval'), - ('experiment', 'xla', xla), - ('experiment', 'xla_flags', None), - ('model', 'model_name', model_name), - ]): - return True - # yapf: enable - return False - - -def run_benchmarks(args, llama_dir: str, results_dir: str, - accelerator_model: str, timestamp: float): - os.chdir(llama_dir) - for size in ['7b', '13b', '70b']: - params_json = 'params.json' - if os.path.exists(params_json): - os.remove(params_json) - os.symlink(f'{size}.json', params_json) - model_name = f"llama2.{size}" - for dynamo in [None, 'inductor', 'openxla']: - backend = dynamo if dynamo else 'lazytensor' - xla = None if dynamo == 'inductor' else 'PJRT' - summary = f"{model_name} eval {backend} batch {args.batch_size}" - - results_file = os.path.join(results_dir, 'results.jsonl') - if benchmark_has_already_run(results_file, model_name, xla, dynamo, - args.batch_size): - logger.info(f"SKIP already completed benchmark -- {summary}") - continue - - logger.info(f"RUN {summary}") - log_file = os.path.join(results_dir, - f'llama-inference.{backend}.{size}.log') - - cmd = [ - 'python', 'example_text_completion.py', '1', '--ckpt_dir', '.', - '--tokenizer_path', - os.path.join(llama_dir, 't5_tokenizer/spiece.model'), '--max_seq_len', - '2048', '--max_gen_len', '1000', f'--max_batch_size', - f'{args.batch_size}', '--mp', 'True', f'--repeat', f'{args.repeat}', - f'--dynamo', f'"{dynamo}"' if dynamo else "''" - ] - - run_env = os.environ.copy() - if dynamo == 'inductor': - run_env['CUDA_VISIBLE_DEVICES'] = '0' - run_env['USE_CUDA'] = '1' - else: - run_env['PJRT_DEVICE'] = 'CUDA' - run_env['GPU_NUM_DEVICES'] = '1' - - run_ok = True - with open(log_file, 'w') as f: - try: - subprocess.check_call(cmd, stdout=f, stderr=f, env=run_env) - except subprocess.CalledProcessError: - logger.warning(f"Run failed -- see {log_file}.") - run_ok = False - - result = { - 'model': { - 'suite_name': 'llama2', - 'model_name': model_name, - }, - 'experiment': { - 'accelerator': 'cuda', - 'accelerator_model': accelerator_model, - 'xla': xla, - 'xla_flags': None, - 'dynamo': dynamo, - 'test': 'eval', - 'batch_size': args.batch_size, - }, - 'repeat': args.repeat, - 'iterations_per_run': 1, - 'metrics': { - # Filled in below. - }, - 'timestamp': timestamp, - } - if run_ok: - latencies = parse_log_file(log_file) - result['metrics']['total_time'] = latencies - else: - result['metrics']['error'] = f"Run failed -- see {log_file}." - - with open(results_file, mode="a", encoding="utf-8") as f: - json.dump(result, f, ensure_ascii=False) - f.write("\n") - - -def parse_args(): - # Helper class for --log-level flag. 
- class LogLevel(Enum): - critical = logging.CRITICAL - error = logging.ERROR - warning = logging.WARNING - info = logging.INFO - debug = logging.DEBUG - - @staticmethod - def parse(s: str): - try: - return LogLevel[s] - except KeyError: - raise ValueError() - - def __str__(self): - return self.name - - parser = argparse.ArgumentParser(description='Run Llama inference benchmarks') - parser.add_argument('--batch_size', type=int, default=1, help='Batch size.') - parser.add_argument( - '--log-level', - default=LogLevel.info, - choices=list(LogLevel), - type=LogLevel.parse, - help='Log level') - parser.add_argument( - '--repeat', type=int, default=8, help='Number of repetitions') - parser.add_argument( - '--workspace_dir', type=str, required=True, help='Workspace directory.') - args = parser.parse_args() - - return args - - -def main(): - args = parse_args() - logging.basicConfig(level=args.log_level.value, force=True) - args.workspace_dir = os.path.expanduser(args.workspace_dir) - if not os.path.exists(args.workspace_dir): - sys.exit(f"Workspace directory {args.workspace_dir} not found.") - - # Sanity check: we should already be inside the appropriate venv. - workspace_dir = os.path.realpath(args.workspace_dir) - logger.debug(f'workspace_dir realpath: {workspace_dir}') - if sys.prefix != os.path.join(workspace_dir, 'env'): - sys.exit( - "Error: must run under the Python venv from the given --workspace_dir.") - - results_dir = os.path.join(workspace_dir, 'experiment_results') - accelerator_model, timestamp = get_info_from_result_file(results_dir) - llama_dir = set_up_llama_repo(workspace_dir) - - run_benchmarks(args, llama_dir, results_dir, accelerator_model, timestamp) - - -if __name__ == "__main__": - main() diff --git a/benchmarks/nightly.sh b/benchmarks/nightly.sh deleted file mode 100755 index 64b34055cbf..00000000000 --- a/benchmarks/nightly.sh +++ /dev/null @@ -1,258 +0,0 @@ -#!/bin/bash -# Pytorch/XLA Nightly Benchmark Runner. - -set -ex - -ACCELERATOR=a100 -OUTPUT_DIR=${HOME:?} -WORKSPACE=$(date --utc +%Y-%m-%d) -REPEAT=8 -ENABLE_PROFILING= - -while getopts 'A:O:PR:T:W:' OPTION -do - case ${OPTION?} in - A) - ACCELERATOR=${OPTARG:?} - ;; - O) - OUTPUT_DIR=${OPTARG:?} - ;; - P) - ENABLE_PROFILING=1 - ;; - R) - REPEAT=${OPTARG:?} - ;; - T) - # Avoid printing the token; re-enable printing later. - { set +x; } 2>/dev/null - export HUGGING_FACE_HUB_TOKEN=${OPTARG:?} - set -x - ;; - W) - WORKSPACE=${OPTARG:?} - ;; - esac -done - -NIGHTLY_RUNS=nightly_runs -NIGHTLY_RESULTS=nightly_results -if [[ ${ENABLE_PROFILING?} ]]; then - NIGHTLY_RUNS=nightly_profiling_runs - NIGHTLY_RESULTS=nightly_profiling_results -fi -WORKSPACE_DIR=${OUTPUT_DIR:?}/${NIGHTLY_RUNS:?}/${WORKSPACE:?} -BM_DIR=${WORKSPACE_DIR:?}/pytorch/xla/benchmarks - -# Intermediate results, which are processed to generate reports. -WORKSPACE_RESULTS_DIR=${WORKSPACE_DIR:?}/experiment_results - -# Final data files and reports go here. -NIGHTLY_RESULTS_DIR=${OUTPUT_DIR:?}/${NIGHTLY_RESULTS:?} - -# Init workspace -# -# Sometimes a run fails halfway. Typically this is because -# experiment_runner crashes. We then fix the problem and -# run the script again, which skips the build phase. -IS_FRESH_RUN=1 # Set to null below; read with ${IS_FRESH_RUN?}. 
-if [ -d ${WORKSPACE_DIR:?} ]; then - IS_FRESH_RUN= -fi - -if [[ ${IS_FRESH_RUN?} ]]; then - rm -rf ${HOME:?}/.cache/bazel -fi - -mkdir -p ${WORKSPACE_DIR:?} -cd ${WORKSPACE_DIR:?} - -ENV_DIR=env -if [[ ${IS_FRESH_RUN?} ]]; then - python3 -m venv ${ENV_DIR:?} -fi -source ${ENV_DIR:?}/bin/activate - -# Download and build everything -if [[ ${IS_FRESH_RUN?} ]]; then - # Install deps - pip install --upgrade pip - - TIMESTAMP=$(date +%s) - # Clone repos first so that their HEAD is as close as possible to $TIMESTAMP. - git clone https://github.com/pytorch/pytorch.git - git clone https://github.com/pytorch/xla.git pytorch/xla - git clone https://github.com/pytorch/vision.git - git clone https://github.com/pytorch/audio.git - git clone https://github.com/pytorch/benchmark.git - - # Set up pytorch - cd pytorch - pip install -r requirements.txt - make triton - USE_CUDA=1 python setup.py develop - cd .. - - # Set up pytorch/xla - cd pytorch/xla - # Query local compute capability. If that fails, assign a sane default. - LOCAL_CAP=compute_$(nvidia-smi --query-gpu=compute_cap --format=csv | \ - tail -1 | sed 's/\.//g' | grep -E '^[0-9]{2}$' || echo '80') - python setup.py develop - cd ../.. - - # Set up torchbench deps. - cd vision - python setup.py develop - cd .. - cd audio - python setup.py develop - cd .. - - # Set up torchbench - cd benchmark - USE_CUDA=1 python install.py - cd .. - - # Apply local patches - cd benchmark - git apply ../pytorch/xla/benchmarks/patches/mismatched_batch_size.patch - cd .. -else - # Grab the timestamp from the first result, if it exists. - # Otherwise take the current timestamp. - TIMESTAMP=$(head -1 ${WORKSPACE_RESULTS_DIR:?}/results.jsonl | \ - sed -E 's|.*\"timestamp\": ([0-9.]+).*|\1|' | \ - grep -E '^[0-9.]+$' || date +%s) -fi - -# Stabilize clock freqs -sudo nvidia-smi --lock-gpu-clocks=1200,1200 - -# Note: this doesn't work on GCP because it's a VM. -# Moreover, we should look into disabling turbo boost if possible. -# sudo cpupower frequency-set --governor performance - -PROFILING_FLAGS= -if [[ ${ENABLE_PROFILING?} ]]; then - PROFILING_FLAGS="--dump-dynamo-counters \ - --collect-dynamo-counters \ - --dump-pytorch-profiles \ - --dump-pytorch-xla-metrics \ - --profile-cuda-cpu \ - --profile-cuda-cpu-individual-ops" -fi - -# Run the experiments -cd pytorch -# Note: to avoid running in Eager mode (i.e. --xla=None --dynamo=None), -# we split experiment_runner.py's invocation in two. -# -# Inference + Training: XLA Lazy tensors, XLA+XLA_Eval Dynamo. -python xla/benchmarks/experiment_runner.py \ - --test=eval --test=train \ - --xla=PJRT \ - --dynamo=None --dynamo=openxla \ - --suite-name=torchbench --accelerator=cuda \ - --output-dirname=${WORKSPACE_RESULTS_DIR:?} \ - --repeat=${REPEAT:?} --print-subprocess \ - --timestamp=${TIMESTAMP:?} ${PROFILING_FLAGS?} -# Inference + Training: Inductor Dynamo. -python xla/benchmarks/experiment_runner.py \ - --test=eval --test=train \ - --xla=None \ - --dynamo=inductor \ - --suite-name=torchbench --accelerator=cuda \ - --output-dirname=${WORKSPACE_RESULTS_DIR:?} \ - --repeat=${REPEAT:?} --print-subprocess \ - --timestamp=${TIMESTAMP:?} ${PROFILING_FLAGS?} -cd .. - -# Run Llama2 benchmarks. 
-python ${BM_DIR:?}/llama.py --workspace_dir=${WORKSPACE_DIR:?} - -# Gather results and generate reports -REPORTS_DIR=${NIGHTLY_RESULTS_DIR:?}/reports/${WORKSPACE:?} -mkdir -p ${REPORTS_DIR:?} -cp ${WORKSPACE_RESULTS_DIR:?}/results.jsonl \ - ${NIGHTLY_RESULTS_DIR:?}/${WORKSPACE:?}.jsonl - -PYTORCH_GIT_REV=$(git -C pytorch rev-parse --short HEAD) -XLA_GIT_TAG=$(git -C pytorch/xla describe --tags --always) -GIT_TAGS="PT: ${PYTORCH_GIT_REV:?} XLA: ${XLA_GIT_TAG:?}" - -COMMON_TITLE_PREFIX= -if [[ ${ENABLE_PROFILING?} ]]; then - COMMON_TITLE_PREFIX="[Profiling ON] " -fi - -INFERENCE_BACKENDS_CMD='--backends inductor openxla+dynamo openxla+lazytensor' -TRAINING_BACKENDS_CMD='--backends inductor openxla+dynamo openxla+lazytensor' - -# Skip result files coming from one-off runs. -INPUT_JSONL_FILES=$(ls ${NIGHTLY_RESULTS_DIR:?}/*.jsonl | \ - grep '[0-9]\+-[0-9]\+-[0-9]\+\.jsonl') - -for testname in inference training; do - for report in latest histogram speedup; do - for format in csv svg; do - for tier in '' 1; do - TITLE_PREFIX= - TIER_CMD= - TIER_FILE_SUFFIX= - if [[ ${tier?} ]]; then - TITLE_PREFIX="${COMMON_TITLE_PREFIX?}Tier${tier?} " - TIER_CMD=--filter-by-tier=${tier:?} - TIER_FILE_SUFFIX=-tier${tier:?} - fi - - TITLE="(${testname:?})" - WIDTH=9 - HEIGHT=7 - if [ "${report:?}" == "latest" ]; then - TITLE="${WORKSPACE:?} (${testname:?}) ${GIT_TAGS:?}" - if [[ -z ${tier?} ]]; then - WIDTH=15 - HEIGHT=8 - fi - fi - BACKENDS_CMD= - if [ "${testname:?}" = 'inference' ]; then - BACKENDS_CMD="${INFERENCE_BACKENDS_CMD:?}" - else - BACKENDS_CMD="${TRAINING_BACKENDS_CMD:?}" - fi - python ${BM_DIR:?}/aggregate.py --accelerator=${ACCELERATOR:?} \ - --report=${report:?} --test=${testname:?} --format=${format:?} \ - --title="${TITLE_PREFIX?}${TITLE:?}" \ - --fig-height=${HEIGHT:?} --fig-width=${WIDTH:?} \ - ${TIER_CMD?} \ - ${BACKENDS_CMD:?} -- \ - ${INPUT_JSONL_FILES:?} \ - > ${REPORTS_DIR:?}/${ACCELERATOR:?}-${testname:?}-${report:?}${TIER_FILE_SUFFIX?}.${format:?} - done - done - done -done - -# Generate Llama2 output. -for testname in inference; do - for report in latest_grouped; do - for format in csv svg tab; do - BACKENDS_CMD= - if [ "${testname:?}" = 'inference' ]; then - BACKENDS_CMD="${INFERENCE_BACKENDS_CMD:?}" - else - BACKENDS_CMD="${TRAINING_BACKENDS_CMD:?}" - fi - python ${BM_DIR:?}/aggregate.py --accelerator=${ACCELERATOR:?} \ - --report=${report:?} --test=${testname:?} --format=${format:?} \ - --title="${COMMON_TITLE_PREFIX?}Llama2 (${testname:?})" \ - --filter='^llama2\.' 
\ - ${BACKENDS_CMD:?} -- \ - ${INPUT_JSONL_FILES:?} \ - > ${REPORTS_DIR:?}/${ACCELERATOR:?}-${testname:?}-${report:?}-llama2.${format:?} - done - done -done diff --git a/benchmarks/result_analyzer.py b/benchmarks/result_analyzer.py index 69f6b323206..3da67fb7067 100644 --- a/benchmarks/result_analyzer.py +++ b/benchmarks/result_analyzer.py @@ -57,7 +57,6 @@ def run_csv(self): "xla_flags": pd.Series(dtype="str"), "dynamo": pd.Series(dtype="str"), "torch_xla2": pd.Series(dtype="str"), - "keep_model_data_on_cuda": pd.Series(dtype="bool"), "test": pd.Series(dtype="str"), "batch_size": pd.Series(dtype="int"), "repeat": pd.Series(dtype="int"), @@ -122,10 +121,6 @@ def extract_metrics_jsonl(self, file: str): dynamo_value = "None" if dynamo is None else dynamo torch_xla2 = dataline["experiment"]["torch_xla2"] torch_xla2_value = "None" if torch_xla2 is None else torch_xla2 - keep_model_data_on_cuda = dataline["experiment"][ - "keep_model_data_on_cuda"] - keep_model_data_on_cuda_value = "None" if keep_model_data_on_cuda is None else str( - keep_model_data_on_cuda) test = dataline["experiment"]["test"] test_value = "None" if test is None else test outputs_file = dataline["experiment"].get("outputs_file", None) @@ -146,7 +141,6 @@ def extract_metrics_jsonl(self, file: str): "xla": xla_value, "dynamo": dynamo_value, "torch_xla2": torch_xla2_value, - "keep_model_data_on_cuda": keep_model_data_on_cuda_value, "test": test_value, "outputs_file": outputs_file_value } @@ -180,38 +174,21 @@ def extract_metrics_csv(self, file: str, metric_df: Optional[pd.DataFrame]): timestamp = dataline[ "timestamp"] if "timestamp" in dataline else self.timestamp d = { - "timestamp": - timestamp, - "suite_name": - dataline["model"]["suite_name"], - "model_name": - dataline["model"]["model_name"], - "accelerator": - dataline["experiment"]["accelerator"], - "accelerator_model": - dataline["experiment"]["accelerator_model"], - "xla": - dataline["experiment"]["xla"], - "xla_flags": - dataline["experiment"]["xla_flags"], - "dynamo": - dataline["experiment"]["dynamo"], - "torch_xla2": - dataline["experiment"]["torch_xla2"], - "keep_model_data_on_cuda": - dataline["experiment"]["keep_model_data_on_cuda"], - "test": - dataline["experiment"]["test"], - "batch_size": - dataline["experiment"]["batch_size"], - "repeat": - dataline["repeat"], - "iterations_per_run": - dataline["iterations_per_run"], - "error_message": - None, - "outputs_file": - dataline["experiment"].get("outputs_file", ""), + "timestamp": timestamp, + "suite_name": dataline["model"]["suite_name"], + "model_name": dataline["model"]["model_name"], + "accelerator": dataline["experiment"]["accelerator"], + "accelerator_model": dataline["experiment"]["accelerator_model"], + "xla": dataline["experiment"]["xla"], + "xla_flags": dataline["experiment"]["xla_flags"], + "dynamo": dataline["experiment"]["dynamo"], + "torch_xla2": dataline["experiment"]["torch_xla2"], + "test": dataline["experiment"]["test"], + "batch_size": dataline["experiment"]["batch_size"], + "repeat": dataline["repeat"], + "iterations_per_run": dataline["iterations_per_run"], + "error_message": None, + "outputs_file": dataline["experiment"].get("outputs_file", ""), } if "error" in dataline["metrics"] and not self._args.hide_errors: diff --git a/benchmarks/run_benchmark.sh b/benchmarks/run_benchmark.sh deleted file mode 100644 index 79b746c10ad..00000000000 --- a/benchmarks/run_benchmark.sh +++ /dev/null @@ -1,79 +0,0 @@ -#!/bin/bash -set -exo pipefail -CDIR="$(cd "$(dirname "$0")" ; pwd -P)" 
-LOGFILE=/tmp/benchmark_test.log - -# Note [Keep Going] -# -# Set the `CONTINUE_ON_ERROR` flag to `1` to make the CI tests continue on error. -# This will allow you to see all the failures on your PR, not stopping with the first -# test failure like the default behavior. -CONTINUE_ON_ERROR="${CONTINUE_ON_ERROR:-0}" -if [[ "$CONTINUE_ON_ERROR" == "1" ]]; then - set +e -fi - -TESTGPUVM=None -TESTTPUVM=None -# NUMBER=0 - -while getopts 'G:T:' OPTION # N: -do - case $OPTION in - G) - TESTGPUVM=$OPTARG - ;; - T) - TESTTPUVM=$OPTARG - ;; - # N) - # NUMBER=$OPTARG - # ;; - esac -done -shift $(($OPTIND - 1)) - -# func for test after ssh to VM, create container and execute in container -function benchmarking_in_container { - sudo docker pull gcr.io/tpu-pytorch/xla:nightly_3.8_cuda_11.8 - sudo apt-get install -y apt-transport-https ca-certificates curl gnupg-agent software-properties-common - nvidia-smi - distribution=$(. /etc/os-release;echo $ID$VERSION_ID) - curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add - - curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list - sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit - sudo systemctl restart docker - sudo docker run --gpus all -it -d gcr.io/tpu-pytorch/xla:nightly_3.8_cuda_11.8 bin/bash - sudo docker exec -it $(sudo docker ps | awk 'NR==2 { print $1 }') /bin/bash - # install torchbench - cd ~ - git clone -b xla_benchmark https://github.com/pytorch/benchmark.git - cd benchmark - # install deps - pip install --pre torchvision torchaudio -i https://download.pytorch.org/whl/nightly/cu118 - # git clone xla - cd ~ - git clone -b benchmark https://github.com/pytorch/xla.git xla - cd ~/xla/benchmarks - # dry run - python3 experiment_runner.py --suite-name=torchbench --accelerator=gpu --progress-bar --dry-run - # run bechmark - python3 experiment_runner.py --suite-name=torchbench --accelerator=gpu --progress-bar - # analyze result to csv - python3 result_analyzer.py -} - - - -if TESTGPUVM='1A100': - # ssh to 1-A100 GPUVM and test in container - gcloud compute ssh a100-manfei-1 --zone us-central1-c --project tpu-prod-env-one-vm -- -o ProxyCommand='corp-ssh-helper %h %p' --command=benchmarking_in_container -elif TESTGPUVM='8A100': - # SSH TO 8-A100 GPUVM and test in container - gcloud compute ssh manfei-a100-8-new --zone us-central1-c --project tpu-prod-env-one-vm -- -o ProxyCommand='corp-ssh-helper %h %p' --command=benchmarking_in_container -elif TESTGPUVM='4H100': - # ssh to 4-H100 GPUVM and test in container -elif TESTTPUVM='v5e8': - # ssh to v5e-8 TPUVM and test in container -elif TESTTPUVM='v5p8': - # ssh to v5p-8 TPUVM and test in container diff --git a/benchmarks/run_single_graph_bm.sh b/benchmarks/run_single_graph_bm.sh deleted file mode 100755 index 98e10a06d05..00000000000 --- a/benchmarks/run_single_graph_bm.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env bash - -set -ex - -DATE=$(date +"%Y_%m_%d_%H_%M") - -OUT_PATH=xla/benchmarks/bm_results/single_graph/$DATE -mkdir -p $OUT_PATH - -python new_xla/benchmarks/experiment_runner.py \ - --dynamo=inductor --dynamo=openxla \ - --xla=None --xla=PJRT \ - --test=eval \ - --filter-by-single-graph \ - --pure-wall-time \ - --suite-name=torchbench \ - --accelerator=cuda \ - --output-dirname=$OUT_PATH \ - --repeat=5 \ - --print-subprocess \ - --no-resume \ - > $OUT_PATH/stdout.txt 2> $OUT_PATH/stderr.txt - -python3 xla/benchmarks/result_analyzer.py \ - --output-dirname=$OUT_PATH \ - 
--database=$OUT_PATH/$DATE.csv diff --git a/benchmarks/run_top_tier_bm.sh b/benchmarks/run_top_tier_bm.sh deleted file mode 100755 index 9b8e8eb8eb6..00000000000 --- a/benchmarks/run_top_tier_bm.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/env bash - -set -ex - -DATE=$(date +"%Y_%m_%d_%H_%M") - -OUT_PATH=xla/benchmarks/bm_results/$DATE -mkdir -p $OUT_PATH - -python xla/benchmarks/experiment_runner.py \ - --dynamo=inductor --dynamo=openxla \ - --xla=None --xla=PJRT \ - --test=eval --test=train \ - --filter-by-tier=1 --filter-by-tier=2 --filter-by-tier=3 \ - --suite-name=torchbench \ - --accelerator=cuda \ - --output-dirname=$OUT_PATH \ - --repeat=5 \ - --print-subprocess \ - --no-resume \ - > $OUT_PATH/stdout.txt 2> $OUT_PATH/stderr.txt - -python3 xla/benchmarks/result_analyzer.py \ - --output-dirname=$OUT_PATH \ - --database=$OUT_PATH/$DATE.csv diff --git a/benchmarks/torchbench_model.py b/benchmarks/torchbench_model.py index 55b7f555276..75a64fa86fd 100644 --- a/benchmarks/torchbench_model.py +++ b/benchmarks/torchbench_model.py @@ -273,13 +273,10 @@ def set_up(self): # Move the initialized model to XLA device if it's not there already. if self.benchmark_experiment.xla and not self.should_initialize_on_xla(): - # First, move the model and the inputs to CPU. - # This avoids having dupplicated data on CUDA. - keep_model_data_on_cuda = self.benchmark_experiment.keep_model_data_on_cuda - if self.is_accelerator_cuda() and not keep_model_data_on_cuda: - self.module = self.module.to("cpu") - self.example_inputs = move_to_device(self.example_inputs, "cpu") - cleanup(self.is_accelerator_cuda()) + assert not self.is_accelerator_cuda() + self.module = self.module.to("cpu") + self.example_inputs = move_to_device(self.example_inputs, "cpu") + cleanup() # Torchbench has quite different setup for yolov3, so directly passing # the right example_inputs diff --git a/benchmarks/util.py b/benchmarks/util.py index bdd965a46a9..3c13232af2f 100644 --- a/benchmarks/util.py +++ b/benchmarks/util.py @@ -51,18 +51,9 @@ def deterministic_torch_manual_seed(*args, **kwargs): @functools.lru_cache(maxsize=3) def is_xla_device_available(devkind, use_xla2: bool = False): - if devkind not in ["CPU", "CUDA", "TPU"]: + if devkind not in ["CPU", "TPU"]: raise ValueError(devkind) # Checking the availability of a given device kind. - # - # We intentionally use subprocess instead of multiprocessing library. The - # reason being that we might initialize CUDA in the parent process and use - # CUDA in the child process. This is a known limitation of using CUDA and - # forking the process. - # - # In this case, subprocess works because it replaces the forked memory with - # the execution of the new program (fresh memory), avoiding the error. - # # For more information: https://github.com/pytorch/xla/pull/5960 CHECK_XLA_DEVICE_PY = "check_xla_device.py" python_file = os.path.join(os.path.dirname(__file__), CHECK_XLA_DEVICE_PY) @@ -80,7 +71,7 @@ def move_to_device(item, device, torch_xla2: bool = False): def move_to_device_func(tensor: torch.Tensor) -> torch.Tensor: # If `tensor` is an XLA tensor, first move it to CPU. We need to do - # that if we want to move the tensor to, say, CUDA. + # that if we want to move the tensor to TPU. 
if tensor.device.type == "xla": return tensor.cpu().to(device) return tensor.to(device) diff --git a/benchmarks/verifier.py b/benchmarks/verifier.py index d2e940711dd..4fefc509cc5 100644 --- a/benchmarks/verifier.py +++ b/benchmarks/verifier.py @@ -152,7 +152,7 @@ def maybe_synchronize(): # Delete the model for saving up memory. del model # Clean-up CUDA as well. - cleanup(cuda=True) + cleanup(cuda=experiment_config["accelerator"] == "cuda") def _apply_eager_config(experiment): diff --git a/test/benchmarks/test_benchmark_experiment.py b/test/benchmarks/test_benchmark_experiment.py index 2c5efcd0583..841beb519e0 100644 --- a/test/benchmarks/test_benchmark_experiment.py +++ b/test/benchmarks/test_benchmark_experiment.py @@ -7,16 +7,15 @@ class BenchmarkExperimentTest(unittest.TestCase): def test_to_dict(self): be = BenchmarkExperiment("cpu", "PJRT", "some xla_flags", "openxla", None, - False, "train", "123", False) + "train", "123", False) actual = be.to_dict() - self.assertEqual(10, len(actual)) + self.assertEqual(9, len(actual)) self.assertEqual("cpu", actual["accelerator"]) self.assertTrue("accelerator_model" in actual) self.assertEqual("PJRT", actual["xla"]) self.assertEqual("some xla_flags", actual["xla_flags"]) self.assertEqual("openxla", actual["dynamo"]) self.assertEqual(None, actual["torch_xla2"]) - self.assertEqual(False, actual["keep_model_data_on_cuda"]) self.assertEqual("train", actual["test"]) self.assertEqual("123", actual["batch_size"]) self.assertEqual(False, actual["enable_functionalization"]) diff --git a/test/benchmarks/test_experiment_runner.py b/test/benchmarks/test_experiment_runner.py index 4ce4167d0e4..e1c572e402f 100644 --- a/test/benchmarks/test_experiment_runner.py +++ b/test/benchmarks/test_experiment_runner.py @@ -29,44 +29,15 @@ def test_dummy_dry_run(self): expected_in_stderr = [ "Number of selected experiment configs: 4", "Number of selected model configs: 1", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"eval\", \"keep_model_data_on_cuda\": false, \"enable_functionalization\": false}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"train\", \"keep_model_data_on_cuda\": false, \"enable_functionalization\": false}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"eval\", \"keep_model_data_on_cuda\": false, \"enable_functionalization\": false}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"train\", \"keep_model_data_on_cuda\": false, \"enable_functionalization\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"eval\", \"enable_functionalization\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"train\", \"enable_functionalization\": false}", + "--model-config={\"model_name\": \"dummy\"} 
--experiment-config={\"accelerator\": \"cpu\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"eval\", \"enable_functionalization\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"train\", \"enable_functionalization\": false}", ] for expected in expected_in_stderr: self.assertIn(expected, child.stderr) - @absltest.skipUnless(xr.device_type() in {'CUDA'}, 'Needs CUDA accelerator') - def test_dummy_dry_run_cuda(self): - child = subprocess.run([ - "python", - EXPERIMENT_RUNNER_PY, - "--dynamo=openxla", - "--dynamo=inductor", - "--xla=PJRT", - "--xla=None", - "--test=eval", - "--test=train", - "--suite-name=dummy", - "--accelerator=cuda", - "--dry-run", - ], - capture_output=True, - text=True) - expected_in_stderr = [ - "Number of selected experiment configs: 4", - "Number of selected model configs: 1", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"eval\", \"keep_model_data_on_cuda\": false, \"enable_functionalization\": false}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"train\", \"keep_model_data_on_cuda\": false, \"enable_functionalization\": false}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"eval\", \"keep_model_data_on_cuda\": false, \"enable_functionalization\": false}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"train\", \"keep_model_data_on_cuda\": false, \"enable_functionalization\": false}", - ] - for expected in expected_in_stderr: - self.assertIn(expected, child.stderr) - - @absltest.skipUnless(xr.device_type() in {'CUDA'}, 'Needs CUDA accelerator') - def test_dummy_dry_run_inductor_cuda(self): + def test_dummy_dry_run_inductor_cpu(self): child = subprocess.run([ "python", EXPERIMENT_RUNNER_PY, @@ -85,14 +56,13 @@ def test_dummy_dry_run_inductor_cuda(self): expected_in_stderr = [ "Number of selected experiment configs: 2", "Number of selected model configs: 1", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"eval\", \"keep_model_data_on_cuda\": false, \"enable_functionalization\": false}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"train\", \"keep_model_data_on_cuda\": false, \"enable_functionalization\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"eval\", \"enable_functionalization\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, 
\"test\": \"train\", \"enable_functionalization\": false}", ] for expected in expected_in_stderr: self.assertIn(expected, child.stderr) - @absltest.skipUnless(xr.device_type() in {'CUDA'}, 'Needs CUDA accelerator') - def test_dummy_openxla_train_cuda(self): + def test_dummy_openxla_train_cpu(self): child = subprocess.run([ "python", EXPERIMENT_RUNNER_PY, @@ -103,7 +73,7 @@ def test_dummy_openxla_train_cuda(self): "--test=eval", "--test=train", "--suite-name=dummy", - "--accelerator=cuda", + "--accelerator=cpu", "--filter=^dummy$", "--dry-run", ], @@ -112,21 +82,20 @@ def test_dummy_openxla_train_cuda(self): expected_in_stderr = [ "Number of selected experiment configs: 4", "Number of selected model configs: 1", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"train\", \"keep_model_data_on_cuda\": false, \"enable_functionalization\": false}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"eval\", \"keep_model_data_on_cuda\": false, \"enable_functionalization\": false}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"eval\", \"keep_model_data_on_cuda\": false, \"enable_functionalization\": false}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"train\", \"keep_model_data_on_cuda\": false, \"enable_functionalization\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"train\", \"enable_functionalization\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"eval\", \"enable_functionalization\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"eval\", \"enable_functionalization\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"train\", \"enable_functionalization\": false}", ] for expected in expected_in_stderr: self.assertIn(expected, child.stderr) - @absltest.skipUnless(xr.device_type() in {'CUDA'}, 'Needs CUDA accelerator') - def test_dummy_dynamo_none_cuda(self): + def test_dummy_dynamo_none_cpu(self): child = subprocess.run([ "python", EXPERIMENT_RUNNER_PY, "--suite-name=dummy", - "--accelerator=cuda", + "--accelerator=cpu", "--xla=PJRT", "--xla=None", "--filter=^dummy$", @@ -137,39 +106,14 @@ def test_dummy_dynamo_none_cuda(self): expected_in_stderr = [ "Number of selected experiment configs: 8", "Number of selected model configs: 1", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": null, \"torch_xla2\": null, \"test\": \"eval\", 
\"keep_model_data_on_cuda\": false, \"enable_functionalization\": false}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": null, \"torch_xla2\": null, \"test\": \"train\", \"keep_model_data_on_cuda\": false, \"enable_functionalization\": false}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"eval\", \"keep_model_data_on_cuda\": false, \"enable_functionalization\": false}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"train\", \"keep_model_data_on_cuda\": false, \"enable_functionalization\": false}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": null, \"torch_xla2\": null, \"test\": \"eval\", \"keep_model_data_on_cuda\": false, \"enable_functionalization\": false}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": null, \"torch_xla2\": null, \"test\": \"train\", \"keep_model_data_on_cuda\": false, \"enable_functionalization\": false}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"eval\", \"keep_model_data_on_cuda\": false, \"enable_functionalization\": false}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"train\", \"keep_model_data_on_cuda\": false, \"enable_functionalization\": false}", - ] - for expected in expected_in_stderr: - self.assertIn(expected, child.stderr) - - @absltest.skipUnless(xr.device_type() in {'CUDA'}, 'Needs CUDA accelerator') - def test_dummy_dry_run_cuda_with_keep_model_data_on_cuda(self): - child = subprocess.run([ - "python", - EXPERIMENT_RUNNER_PY, - "--dynamo=openxla", - "--xla=PJRT", - "--test=eval", - "--test=train", - "--suite-name=dummy", - "--accelerator=cuda", - "--keep-model-data-on-cuda", - "--dry-run", - ], - capture_output=True, - text=True) - expected_in_stderr = [ - "Number of selected experiment configs: 2", - "Number of selected model configs: 1", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"eval\", \"keep_model_data_on_cuda\": true, \"enable_functionalization\": false}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"train\", \"keep_model_data_on_cuda\": true, \"enable_functionalization\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": null, \"torch_xla2\": null, \"test\": \"eval\", \"enable_functionalization\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": null, \"torch_xla2\": null, \"test\": 
\"train\", \"enable_functionalization\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"eval\", \"enable_functionalization\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"train\", \"enable_functionalization\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": null, \"xla_flags\": null, \"dynamo\": null, \"torch_xla2\": null, \"test\": \"eval\", \"enable_functionalization\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": null, \"xla_flags\": null, \"dynamo\": null, \"torch_xla2\": null, \"test\": \"train\", \"enable_functionalization\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"eval\", \"enable_functionalization\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"train\", \"enable_functionalization\": false}", ] for expected in expected_in_stderr: self.assertIn(expected, child.stderr) @@ -192,8 +136,8 @@ def test_dummy_dry_run_with_functionalization(self): expected_in_stderr = [ "Number of selected experiment configs: 2", "Number of selected model configs: 1", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"eval\", \"keep_model_data_on_cuda\": false, \"enable_functionalization\": true}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"train\", \"keep_model_data_on_cuda\": false, \"enable_functionalization\": true}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"eval\", \"enable_functionalization\": true}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"train\", \"enable_functionalization\": true}", ] for expected in expected_in_stderr: self.assertIn(expected, child.stderr)