From b40411e4ec39d9c6a7b43a67863f8a6a47e59d25 Mon Sep 17 00:00:00 2001
From: iefgnoix
Date: Mon, 3 Jun 2024 18:14:10 +0000
Subject: [PATCH 1/8] Add how to run benchmark script for a single config
 without spawning processes.

---
 benchmarks/README.md | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/benchmarks/README.md b/benchmarks/README.md
index 5b694a401581..f8b637f4399c 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -74,6 +74,14 @@ among the flags `--dynamo`, `--xla`, and `--test`,
 4 of which are supported:
   - `dynamo=inductor`, `xla=None`, `test=train`
 
+## Run benchmarking for a single configuration
+
+The section `Experiment runner` above shows how to run the benchmarking script for a combination of configurations. For each configuration,
+the script starts a new process and runs the benchmark. This section shows how to run the benchmark for a single configuration without spawning new processes.
+
+
+
+
 ## Verification module
 
 Verification flag, enabled by running the experiment runner script with
 `--verify`

From 797e9f3c54d94fe030dd6197fe043f63e3c43c53 Mon Sep 17 00:00:00 2001
From: iefgnoix
Date: Tue, 4 Jun 2024 21:54:51 +0000
Subject: [PATCH 2/8] Add options in benchmark script to keep model on CUDA.

---
 benchmarks/README.md               | 11 ++++++++++-
 benchmarks/benchmark_experiment.py | 14 +++++++++++++-
 benchmarks/benchmark_model.py      | 11 ++++++++---
 benchmarks/experiment_runner.py    |  9 +++++++++
 benchmarks/torchbench_model.py     |  4 +++-
 5 files changed, 43 insertions(+), 6 deletions(-)

diff --git a/benchmarks/README.md b/benchmarks/README.md
index f8b637f4399c..20cf06aa98f1 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -79,7 +79,16 @@ among the flags `--dynamo`, `--xla`, and `--test`,
 The section `Experiment runner` above shows how to run the benchmarking script for a combination of configurations. For each configuration,
 the script starts a new process and runs the benchmark. This section shows how to run the benchmark for a single configuration without spawning new processes.
 
-
+```
+cd pytorch
+python xla/benchmarks/experiment_runner.py \
+    --suite-name=torchbench \
+    --accelerator=cuda \
+    --progress-bar \
+    --model-config=\{\"model_name\":\"BERT_pytorch\"\} \
+    --experiment-config=\{\"accelerator\":\"cuda\",\"xla\":\"PJRT\",\"xla_flags\":null,\"dynamo\":\"openxla\",\"torch_xla2\":null,\"test\":\"train\"\} \
+    --repeat 1
+```
 
 
 ## Verification module
 
diff --git a/benchmarks/benchmark_experiment.py b/benchmarks/benchmark_experiment.py
index 26a56c49f249..081d5a16f2ee 100644
--- a/benchmarks/benchmark_experiment.py
+++ b/benchmarks/benchmark_experiment.py
@@ -24,6 +24,7 @@ def list_experiment_configs(self):
         "dynamo": [None, "inductor", "openxla_eval", "openxla"],
         "torch_xla2": [None],  # options only apply to torch_xla2
         "test": ["eval", "train"],
+        "xla_take_cuda_model_and_data": [False],
     }
 
     # Apply command line choices.
@@ -42,6 +43,8 @@ def list_experiment_configs(self):
     if self._args.xla_flags:
       config_choices["xla_flags"] = list(
           map(parse_none_str, set(self._args.xla_flags)))
+    if self._args.xla_take_cuda_model_and_data:
+      config_choices["xla_take_cuda_model_and_data"] = [self._args.xla_take_cuda_model_and_data]
 
     # Expand experiment configs and add env vars.
logger.debug(f"Expand experiment configs")
@@ -71,6 +74,7 @@ def _is_available(self, experiment_config):
     cfg_xla = experiment_config["xla"]
     cfg_test = experiment_config["test"]
     cfg_torch_xla2 = experiment_config["torch_xla2"]
+    cfg_xla_take_cuda_model_and_data = experiment_config["xla_take_cuda_model_and_data"]
 
     # Check that dynamo refers to an existing backend.
     if cfg_dynamo is not None and cfg_dynamo not in dynamo.list_backends(
@@ -109,6 +113,10 @@ def _is_available(self, experiment_config):
       pass
     else:
       raise NotImplementedError
+
+    # cfg_xla_take_cuda_model_and_data is only available when using dynamo
+    if cfg_xla_take_cuda_model_and_data and cfg_dynamo != "openxla":
+      return False
 
     return True
 
@@ -120,25 +128,28 @@ def load_experiment(self, experiment_config):
     test = experiment_config["test"]
     batch_size = experiment_config.get("batch_size", self._args.batch_size)
     torch_xla2 = experiment_config["torch_xla2"]
+    xla_take_cuda_model_and_data = experiment_config["xla_take_cuda_model_and_data"]
     return BenchmarkExperiment(
         accelerator=accelerator,
         xla=xla,
         xla_flags=xla_flags,
         dynamo=dynamo,
         torch_xla2=torch_xla2,
+        xla_take_cuda_model_and_data=xla_take_cuda_model_and_data,
         test=test,
         batch_size=batch_size)
 
 
 class BenchmarkExperiment:
 
-  def __init__(self, accelerator, xla, xla_flags, dynamo, torch_xla2, test,
+  def __init__(self, accelerator, xla, xla_flags, dynamo, torch_xla2, xla_take_cuda_model_and_data, test,
                batch_size):
     self.accelerator = accelerator
     self.xla = xla
     self.xla_flags = xla_flags
     self.dynamo = dynamo
     self.torch_xla2 = torch_xla2
+    self.xla_take_cuda_model_and_data = xla_take_cuda_model_and_data
     self.test = test
     self.batch_size = batch_size
     self.accelerator_model = get_accelerator_model(self.accelerator)
@@ -202,6 +213,7 @@ def to_dict(self):
     d["xla_flags"] = self.xla_flags
     d["dynamo"] = self.dynamo
     d["torch_xla2"] = self.torch_xla2
+    d["xla_take_cuda_model_and_data"] = self.xla_take_cuda_model_and_data
     d["test"] = self.test
     d["batch_size"] = self.batch_size
     return d
diff --git a/benchmarks/benchmark_model.py b/benchmarks/benchmark_model.py
index 62b53e126a0c..12d725256932 100644
--- a/benchmarks/benchmark_model.py
+++ b/benchmarks/benchmark_model.py
@@ -110,6 +110,7 @@ def conversion_dtype(self):
 
   def prepare_for_experiment(self, dynamo_compilation_opts):
     self.device = self.benchmark_experiment.get_device()
+    print('xw32 line113 prepare_for_experiment begin self.device=', self.device)
     self.dtype = self.conversion_dtype()
     if self.dtype is not None:
       self.module = self.module.to(self.dtype)
@@ -122,6 +123,7 @@ def prepare_for_experiment(self, dynamo_compilation_opts):
     else:
       raise NotImplementedError
 
+    keep_model_data_on_cuda = self.benchmark_experiment.xla_take_cuda_model_and_data
     if self.benchmark_experiment.torch_xla2:
       import torch_xla2.export
       import torch_xla2
@@ -142,11 +144,11 @@ def prepare_for_experiment(self, dynamo_compilation_opts):
       jax_func = jax.jit(jax_func)
       self.module = lambda *x: jax_func(weights, x)
       self.example_inputs = move_to_device(self.example_inputs, device,
-                                           self.benchmark_experiment.torch_xla2)
-    else:
+                                           torch_xla2=True)
+    elif not keep_model_data_on_cuda:
       self.module = self.module.to(self.device)
       self.example_inputs = move_to_device(self.example_inputs, self.device,
-                                           self.benchmark_experiment.torch_xla2)
+                                           torch_xla2=False)
 
     if self.benchmark_experiment.dynamo:
       compilation_opts = dynamo_compilation_opts.copy()
@@ -154,6 +156,9 @@ def prepare_for_experiment(self, dynamo_compilation_opts):
       logger.info(f"Running torch.compile with opts {compilation_opts}")
self.model_iter_fn = torch.compile(self.model_iter_fn, **compilation_opts) + + if keep_model_data_on_cuda: + assert self.example_inputs[0].device.type.lower() == 'cuda', 'When xla_take_cuda_model_and_data is set, the input data should remain on the CUDA device.' def pick_grad(self): if self.benchmark_experiment.test == "eval": diff --git a/benchmarks/experiment_runner.py b/benchmarks/experiment_runner.py index 11719b476ec2..b3c236a9469e 100644 --- a/benchmarks/experiment_runner.py +++ b/benchmarks/experiment_runner.py @@ -54,8 +54,10 @@ def run(self): is_main_process = self._args.experiment_config is None and \ self._args.model_config is None if is_main_process: + print('xw32 is_main_process is true') self.generate_and_run_all_configs() else: + print('xw32 is_main_process is false') assert self._args.experiment_config is not None and \ self._args.model_config is not None self.run_single_config() @@ -241,6 +243,7 @@ def _pure_wall_time_iter_fn(self, benchmark_experiment, benchmark_model, def run_single_config(self): # Load experiment and model. + print('xw32 line246 run_single_config self._args.experiment_config=', self._args.experiment_config) experiment_config = json.loads(self._args.experiment_config) model_config = json.loads(self._args.model_config) benchmark_experiment = self.experiment_loader.load_experiment( @@ -862,6 +865,11 @@ def __str__(self): help="""Collect CUDA and CPU times per operation. This will also gather CPU fallbacks.""", ) + parser.add_argument( + "--xla-take-cuda-model-and-data", + action="store_true", + help="""Whether PyTorch/XLA takes models and data on CUDA.""", + ) parser.add_argument( "--xla-flags", type=str, @@ -942,6 +950,7 @@ def main(): logging.basicConfig(level=args.log_level.value, force=True) logger.debug(f"Parsed args: {args}") + # args.xla_take_cuda_model_and_data available if not args.disable_tf32: logger.warning('Enabling fast F32 multiplication for PyTorch') diff --git a/benchmarks/torchbench_model.py b/benchmarks/torchbench_model.py index 838a50141876..2d9e8d41ac10 100644 --- a/benchmarks/torchbench_model.py +++ b/benchmarks/torchbench_model.py @@ -255,8 +255,10 @@ def set_up(self): if self.benchmark_experiment.xla: # First, move the model and the inputs to CPU. # This avoids having dupplicated data on CUDA. 
- if self.is_accelerator_cuda(): + keep_model_data_on_cuda = self.benchmark_experiment.xla_take_cuda_model_and_data + if self.is_accelerator_cuda() and not keep_model_data_on_cuda: self.module = self.module.to("cpu") + print('xw32 line260 self.example_inputs[0].device=', self.example_inputs[0].device) self.example_inputs = move_to_device(self.example_inputs, "cpu") self._cleanup() From edafc66042f45d1681b471b65751ed57cb17584c Mon Sep 17 00:00:00 2001 From: iefgnoix Date: Tue, 4 Jun 2024 22:07:05 +0000 Subject: [PATCH 3/8] fix linter --- benchmarks/benchmark_experiment.py | 16 ++++++++++------ benchmarks/benchmark_model.py | 13 +++++++------ benchmarks/experiment_runner.py | 3 ++- benchmarks/torchbench_model.py | 3 ++- 4 files changed, 21 insertions(+), 14 deletions(-) diff --git a/benchmarks/benchmark_experiment.py b/benchmarks/benchmark_experiment.py index 081d5a16f2ee..34d2e0db73ad 100644 --- a/benchmarks/benchmark_experiment.py +++ b/benchmarks/benchmark_experiment.py @@ -44,7 +44,9 @@ def list_experiment_configs(self): config_choices["xla_flags"] = list( map(parse_none_str, set(self._args.xla_flags))) if self._args.xla_take_cuda_model_and_data: - config_choices["xla_take_cuda_model_and_data"] = [self._args.xla_take_cuda_model_and_data] + config_choices["xla_take_cuda_model_and_data"] = [ + self._args.xla_take_cuda_model_and_data + ] # Expand experiment configs and add env vars. logger.debug(f"Expand experiment configs") @@ -74,7 +76,8 @@ def _is_available(self, experiment_config): cfg_xla = experiment_config["xla"] cfg_test = experiment_config["test"] cfg_torch_xla2 = experiment_config["torch_xla2"] - cfg_xla_take_cuda_model_and_data = experiment_config["xla_take_cuda_model_and_data"] + cfg_xla_take_cuda_model_and_data = experiment_config[ + "xla_take_cuda_model_and_data"] # Check that dynamo refers to an existing backend. 
if cfg_dynamo is not None and cfg_dynamo not in dynamo.list_backends(
@@ -113,7 +116,7 @@ def _is_available(self, experiment_config):
       pass
     else:
       raise NotImplementedError
- 
+
     # cfg_xla_take_cuda_model_and_data is only available when using dynamo
     if cfg_xla_take_cuda_model_and_data and cfg_dynamo != "openxla":
       return False
@@ -128,7 +131,8 @@ def load_experiment(self, experiment_config):
     test = experiment_config["test"]
     batch_size = experiment_config.get("batch_size", self._args.batch_size)
     torch_xla2 = experiment_config["torch_xla2"]
-    xla_take_cuda_model_and_data = experiment_config["xla_take_cuda_model_and_data"]
+    xla_take_cuda_model_and_data = experiment_config[
+        "xla_take_cuda_model_and_data"]
     return BenchmarkExperiment(
         accelerator=accelerator,
         xla=xla,
@@ -142,8 +146,8 @@ def load_experiment(self, experiment_config):
 
 class BenchmarkExperiment:
 
-  def __init__(self, accelerator, xla, xla_flags, dynamo, torch_xla2, xla_take_cuda_model_and_data, test,
-               batch_size):
+  def __init__(self, accelerator, xla, xla_flags, dynamo, torch_xla2,
+               xla_take_cuda_model_and_data, test, batch_size):
     self.accelerator = accelerator
     self.xla = xla
     self.xla_flags = xla_flags
diff --git a/benchmarks/benchmark_model.py b/benchmarks/benchmark_model.py
index 12d725256932..449c74803a5a 100644
--- a/benchmarks/benchmark_model.py
+++ b/benchmarks/benchmark_model.py
@@ -143,12 +143,12 @@ def prepare_for_experiment(self, dynamo_compilation_opts):
       weights)
       jax_func = jax.jit(jax_func)
       self.module = lambda *x: jax_func(weights, x)
-      self.example_inputs = move_to_device(self.example_inputs, device,
-                                           torch_xla2=True)
+      self.example_inputs = move_to_device(
+          self.example_inputs, device, torch_xla2=True)
     elif not keep_model_data_on_cuda:
       self.module = self.module.to(self.device)
-      self.example_inputs = move_to_device(self.example_inputs, self.device,
-                                           torch_xla2=False)
+      self.example_inputs = move_to_device(
+          self.example_inputs, self.device, torch_xla2=False)
 
     if self.benchmark_experiment.dynamo:
       compilation_opts = dynamo_compilation_opts.copy()
@@ -156,9 +156,10 @@ def prepare_for_experiment(self, dynamo_compilation_opts):
       logger.info(f"Running torch.compile with opts {compilation_opts}")
       self.model_iter_fn = torch.compile(self.model_iter_fn,
                                          **compilation_opts)
-
+
     if keep_model_data_on_cuda:
-      assert self.example_inputs[0].device.type.lower() == 'cuda', 'When xla_take_cuda_model_and_data is set, the input data should remain on the CUDA device.'
+      assert self.example_inputs[0].device.type.lower(
+      ) == 'cuda', 'When xla_take_cuda_model_and_data is set, the input data should remain on the CUDA device.'
 
   def pick_grad(self):
     if self.benchmark_experiment.test == "eval":
diff --git a/benchmarks/experiment_runner.py b/benchmarks/experiment_runner.py
index b3c236a9469e..29c1168beafb 100644
--- a/benchmarks/experiment_runner.py
+++ b/benchmarks/experiment_runner.py
@@ -243,7 +243,8 @@ def _pure_wall_time_iter_fn(self, benchmark_experiment, benchmark_model,
 
   def run_single_config(self):
     # Load experiment and model.
- print('xw32 line246 run_single_config self._args.experiment_config=', self._args.experiment_config) + print('xw32 line246 run_single_config self._args.experiment_config=', + self._args.experiment_config) experiment_config = json.loads(self._args.experiment_config) model_config = json.loads(self._args.model_config) benchmark_experiment = self.experiment_loader.load_experiment( diff --git a/benchmarks/torchbench_model.py b/benchmarks/torchbench_model.py index 2d9e8d41ac10..42ffec82b210 100644 --- a/benchmarks/torchbench_model.py +++ b/benchmarks/torchbench_model.py @@ -258,7 +258,8 @@ def set_up(self): keep_model_data_on_cuda = self.benchmark_experiment.xla_take_cuda_model_and_data if self.is_accelerator_cuda() and not keep_model_data_on_cuda: self.module = self.module.to("cpu") - print('xw32 line260 self.example_inputs[0].device=', self.example_inputs[0].device) + print('xw32 line260 self.example_inputs[0].device=', + self.example_inputs[0].device) self.example_inputs = move_to_device(self.example_inputs, "cpu") self._cleanup() From 009b83f9a633d2bd38ad9cdb90cacc9614cda9c6 Mon Sep 17 00:00:00 2001 From: iefgnoix Date: Tue, 4 Jun 2024 22:10:35 +0000 Subject: [PATCH 4/8] remove unwanted comments --- benchmarks/benchmark_model.py | 1 - benchmarks/experiment_runner.py | 4 ---- benchmarks/torchbench_model.py | 2 -- 3 files changed, 7 deletions(-) diff --git a/benchmarks/benchmark_model.py b/benchmarks/benchmark_model.py index 449c74803a5a..3846ab965373 100644 --- a/benchmarks/benchmark_model.py +++ b/benchmarks/benchmark_model.py @@ -110,7 +110,6 @@ def conversion_dtype(self): def prepare_for_experiment(self, dynamo_compilation_opts): self.device = self.benchmark_experiment.get_device() - print('xw32 line113 prepare_for_experiment begin self.device=', self.device) self.dtype = self.conversion_dtype() if self.dtype is not None: self.module = self.module.to(self.dtype) diff --git a/benchmarks/experiment_runner.py b/benchmarks/experiment_runner.py index 29c1168beafb..abf564ec0a99 100644 --- a/benchmarks/experiment_runner.py +++ b/benchmarks/experiment_runner.py @@ -54,10 +54,8 @@ def run(self): is_main_process = self._args.experiment_config is None and \ self._args.model_config is None if is_main_process: - print('xw32 is_main_process is true') self.generate_and_run_all_configs() else: - print('xw32 is_main_process is false') assert self._args.experiment_config is not None and \ self._args.model_config is not None self.run_single_config() @@ -243,8 +241,6 @@ def _pure_wall_time_iter_fn(self, benchmark_experiment, benchmark_model, def run_single_config(self): # Load experiment and model. 
-    print('xw32 line246 run_single_config self._args.experiment_config=',
-          self._args.experiment_config)
     experiment_config = json.loads(self._args.experiment_config)
     model_config = json.loads(self._args.model_config)
     benchmark_experiment = self.experiment_loader.load_experiment(
diff --git a/benchmarks/torchbench_model.py b/benchmarks/torchbench_model.py
index 42ffec82b210..2e8a60b436d8 100644
--- a/benchmarks/torchbench_model.py
+++ b/benchmarks/torchbench_model.py
@@ -258,7 +258,6 @@ def set_up(self):
     keep_model_data_on_cuda = self.benchmark_experiment.xla_take_cuda_model_and_data
     if self.is_accelerator_cuda() and not keep_model_data_on_cuda:
       self.module = self.module.to("cpu")
-      print('xw32 line260 self.example_inputs[0].device=',
-            self.example_inputs[0].device)
       self.example_inputs = move_to_device(self.example_inputs, "cpu")
     self._cleanup()

From c55e6f54c0f342fc39ee374cd7cd2534748d8918 Mon Sep 17 00:00:00 2001
From: iefgnoix
Date: Wed, 5 Jun 2024 21:43:35 +0000
Subject: [PATCH 5/8] fix comments and tests

---
 benchmarks/README.md                         |  5 +-
 benchmarks/benchmark_experiment.py           |  2 +-
 benchmarks/experiment_runner.py              |  1 -
 test/benchmarks/test_benchmark_experiment.py |  5 +-
 test/benchmarks/test_experiment_runner.py    | 73 +++++++++++++------
 5 files changed, 55 insertions(+), 31 deletions(-)

diff --git a/benchmarks/README.md b/benchmarks/README.md
index 20cf06aa98f1..60bbda146b42 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -83,10 +83,9 @@ the script starts a new process and runs the benchmark. This section shows how to
 cd pytorch
 python xla/benchmarks/experiment_runner.py \
     --suite-name=torchbench \
-    --accelerator=cuda \
     --progress-bar \
-    --model-config=\{\"model_name\":\"BERT_pytorch\"\} \
-    --experiment-config=\{\"accelerator\":\"cuda\",\"xla\":\"PJRT\",\"xla_flags\":null,\"dynamo\":\"openxla\",\"torch_xla2\":null,\"test\":\"train\"\} \
+    --model-config='{"model_name":"BERT_pytorch"}' \
+    --experiment-config='{"accelerator":"cuda","xla":"PJRT","xla_flags":null,"dynamo":"openxla","torch_xla2":null,"test":"train","xla_take_cuda_model_and_data":false}' \
     --repeat 1
 ```

diff --git a/benchmarks/benchmark_experiment.py b/benchmarks/benchmark_experiment.py
index 34d2e0db73ad..9eaabfe8505f 100644
--- a/benchmarks/benchmark_experiment.py
+++ b/benchmarks/benchmark_experiment.py
@@ -147,7 +147,7 @@ class BenchmarkExperiment:
 
   def __init__(self, accelerator, xla, xla_flags, dynamo, torch_xla2,
-               xla_take_cuda_model_and_data, test, batch_size):
+               xla_take_cuda_model_and_data: bool, test, batch_size):
diff --git a/benchmarks/experiment_runner.py b/benchmarks/experiment_runner.py
index abf564ec0a99..8ffac1ab32fe 100644
--- a/benchmarks/experiment_runner.py
+++ b/benchmarks/experiment_runner.py
@@ -947,7 +947,6 @@ def main():
   logging.basicConfig(level=args.log_level.value, force=True)
   logger.debug(f"Parsed args: {args}")
-  # args.xla_take_cuda_model_and_data available
 
   if not args.disable_tf32:
     logger.warning('Enabling fast F32 multiplication for PyTorch')
diff --git a/test/benchmarks/test_benchmark_experiment.py b/test/benchmarks/test_benchmark_experiment.py
index 461df56687f1..3d8e85e0b360 100644
--- a/test/benchmarks/test_benchmark_experiment.py
+++ b/test/benchmarks/test_benchmark_experiment.py
@@ -6,16 +6,17 @@ class BenchmarkExperimentTest(unittest.TestCase):
 
   def test_to_dict(self):
-    be = BenchmarkExperiment("cpu", "PJRT", "some xla_flags", "openxla", None,
+    be = 
BenchmarkExperiment("cpu", "PJRT", "some xla_flags", "openxla", None, False, "train", "123") actual = be.to_dict() - self.assertEqual(8, len(actual)) + self.assertEqual(9, len(actual)) self.assertEqual("cpu", actual["accelerator"]) self.assertTrue("accelerator_model" in actual) self.assertEqual("PJRT", actual["xla"]) self.assertEqual("some xla_flags", actual["xla_flags"]) self.assertEqual("openxla", actual["dynamo"]) self.assertEqual(None, actual["torch_xla2"]) + self.assertEqual(False, actual["xla_take_cuda_model_and_data"]) self.assertEqual("train", actual["test"]) self.assertEqual("123", actual["batch_size"]) diff --git a/test/benchmarks/test_experiment_runner.py b/test/benchmarks/test_experiment_runner.py index 81386d7d82d3..9459388dee77 100644 --- a/test/benchmarks/test_experiment_runner.py +++ b/test/benchmarks/test_experiment_runner.py @@ -29,10 +29,10 @@ def test_dummy_dry_run(self): expected_in_stderr = [ "Number of selected experiment configs: 4", "Number of selected model configs: 1", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"eval\"}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"train\"}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"eval\"}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"train\"}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"eval\", \"xla_take_cuda_model_and_data\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"train\", \"xla_take_cuda_model_and_data\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"eval\", \"xla_take_cuda_model_and_data\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"train\", \"xla_take_cuda_model_and_data\": false}", ] for expected in expected_in_stderr: self.assertIn(expected, child.stderr) @@ -57,10 +57,10 @@ def test_dummy_dry_run_cuda(self): expected_in_stderr = [ "Number of selected experiment configs: 4", "Number of selected model configs: 1", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"eval\"}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"train\"}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": 
null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"eval\"}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"train\"}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"eval\", \"xla_take_cuda_model_and_data\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"train\", \"xla_take_cuda_model_and_data\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"eval\", \"xla_take_cuda_model_and_data\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"train\", \"xla_take_cuda_model_and_data\": false}", ] for expected in expected_in_stderr: self.assertIn(expected, child.stderr) @@ -85,8 +85,8 @@ def test_dummy_dry_run_inductor_cuda(self): expected_in_stderr = [ "Number of selected experiment configs: 2", "Number of selected model configs: 1", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"eval\"}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"train\"}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"eval\", \"xla_take_cuda_model_and_data\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"train\", \"xla_take_cuda_model_and_data\": false}", ] for expected in expected_in_stderr: self.assertIn(expected, child.stderr) @@ -113,11 +113,11 @@ def test_dummy_openxla_eval_train_cuda(self): expected_in_stderr = [ "Number of selected experiment configs: 5", "Number of selected model configs: 1", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla_eval\", \"torch_xla2\": null, \"test\": \"eval\"}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"train\"}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"eval\"}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"eval\"}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, 
\"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"train\"}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla_eval\", \"torch_xla2\": null, \"test\": \"eval\", \"xla_take_cuda_model_and_data\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"train\", \"xla_take_cuda_model_and_data\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"eval\", \"xla_take_cuda_model_and_data\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"eval\", \"xla_take_cuda_model_and_data\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"train\", \"xla_take_cuda_model_and_data\": false}", ] for expected in expected_in_stderr: self.assertIn(expected, child.stderr) @@ -139,15 +139,40 @@ def test_dummy_dynamo_none_cuda(self): expected_in_stderr = [ "Number of selected experiment configs: 9", "Number of selected model configs: 1", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": null, \"torch_xla2\": null, \"test\": \"eval\"}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": null, \"torch_xla2\": null, \"test\": \"train\"}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla_eval\", \"torch_xla2\": null, \"test\": \"eval\"}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"eval\"}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"train\"}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": null, \"torch_xla2\": null, \"test\": \"eval\"}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": null, \"torch_xla2\": null, \"test\": \"train\"}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"eval\"}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"train\"}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": null, \"torch_xla2\": null, \"test\": \"eval\", 
\"xla_take_cuda_model_and_data\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": null, \"torch_xla2\": null, \"test\": \"train\", \"xla_take_cuda_model_and_data\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla_eval\", \"torch_xla2\": null, \"test\": \"eval\", \"xla_take_cuda_model_and_data\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"eval\", \"xla_take_cuda_model_and_data\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"train\", \"xla_take_cuda_model_and_data\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": null, \"torch_xla2\": null, \"test\": \"eval\", \"xla_take_cuda_model_and_data\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": null, \"torch_xla2\": null, \"test\": \"train\", \"xla_take_cuda_model_and_data\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"eval\", \"xla_take_cuda_model_and_data\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"train\", \"xla_take_cuda_model_and_data\": false}", + ] + for expected in expected_in_stderr: + self.assertIn(expected, child.stderr) + + @absltest.skipUnless(xr.device_type() in {'CUDA'}, 'Needs CUDA accelerator') + def test_dummy_dry_run_cuda_with_xla_take_cuda_model_and_data(self): + child = subprocess.run([ + "python", + EXPERIMENT_RUNNER_PY, + "--dynamo=openxla", + "--xla=PJRT", + "--test=eval", + "--test=train", + "--suite-name=dummy", + "--accelerator=cuda", + "--xla-take-cuda-model-and-data", + "--dry-run", + ], + capture_output=True, + text=True) + expected_in_stderr = [ + "Number of selected experiment configs: 2", + "Number of selected model configs: 1", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"eval\", \"xla_take_cuda_model_and_data\": true}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"train\", \"xla_take_cuda_model_and_data\": true}", ] for expected in expected_in_stderr: self.assertIn(expected, child.stderr) From bbdaa9bd3f8819bb963f76209fd87c04d67e6a69 Mon Sep 17 00:00:00 2001 From: iefgnoix Date: Wed, 5 Jun 2024 21:52:56 +0000 Subject: [PATCH 6/8] rename xla_take_cuda_model_and_data to keep_model_data_on_cuda --- benchmarks/README.md | 2 +- benchmarks/benchmark_experiment.py | 26 +++++----- benchmarks/benchmark_model.py | 4 +- benchmarks/experiment_runner.py | 4 +- benchmarks/torchbench_model.py | 2 
+-
 test/benchmarks/test_benchmark_experiment.py |  6 +--
 test/benchmarks/test_experiment_runner.py    | 54 ++++++++++----------
 7 files changed, 48 insertions(+), 50 deletions(-)

diff --git a/benchmarks/README.md b/benchmarks/README.md
index 60bbda146b42..a407ae2cef13 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -85,7 +85,7 @@ python xla/benchmarks/experiment_runner.py \
     --suite-name=torchbench \
     --progress-bar \
     --model-config='{"model_name":"BERT_pytorch"}' \
-    --experiment-config='{"accelerator":"cuda","xla":"PJRT","xla_flags":null,"dynamo":"openxla","torch_xla2":null,"test":"train","xla_take_cuda_model_and_data":false}' \
+    --experiment-config='{"accelerator":"cuda","xla":"PJRT","xla_flags":null,"dynamo":"openxla","torch_xla2":null,"test":"train","keep_model_data_on_cuda":false}' \
     --repeat 1
 ```

diff --git a/benchmarks/benchmark_experiment.py b/benchmarks/benchmark_experiment.py
index 9eaabfe8505f..5deed5db5e53 100644
--- a/benchmarks/benchmark_experiment.py
+++ b/benchmarks/benchmark_experiment.py
@@ -24,7 +24,7 @@ def list_experiment_configs(self):
         "dynamo": [None, "inductor", "openxla_eval", "openxla"],
         "torch_xla2": [None],  # options only apply to torch_xla2
         "test": ["eval", "train"],
-        "xla_take_cuda_model_and_data": [False],
+        "keep_model_data_on_cuda": [False],
     }
 
     # Apply command line choices.
@@ -43,9 +43,9 @@ def list_experiment_configs(self):
     if self._args.xla_flags:
       config_choices["xla_flags"] = list(
           map(parse_none_str, set(self._args.xla_flags)))
-    if self._args.xla_take_cuda_model_and_data:
-      config_choices["xla_take_cuda_model_and_data"] = [
-          self._args.xla_take_cuda_model_and_data
+    if self._args.keep_model_data_on_cuda:
+      config_choices["keep_model_data_on_cuda"] = [
+          self._args.keep_model_data_on_cuda
       ]
 
     # Expand experiment configs and add env vars.
@@ -76,8 +76,7 @@ def _is_available(self, experiment_config):
     cfg_xla = experiment_config["xla"]
     cfg_test = experiment_config["test"]
     cfg_torch_xla2 = experiment_config["torch_xla2"]
-    cfg_xla_take_cuda_model_and_data = experiment_config[
-        "xla_take_cuda_model_and_data"]
+    cfg_keep_model_data_on_cuda = experiment_config["keep_model_data_on_cuda"]
 
     # Check that dynamo refers to an existing backend.
if cfg_dynamo is not None and cfg_dynamo not in dynamo.list_backends(
@@ -117,8 +116,8 @@ def _is_available(self, experiment_config):
     else:
       raise NotImplementedError
 
-    # cfg_xla_take_cuda_model_and_data is only available when using dynamo
-    if cfg_xla_take_cuda_model_and_data and cfg_dynamo != "openxla":
+    # cfg_keep_model_data_on_cuda is only available when using dynamo
+    if cfg_keep_model_data_on_cuda and cfg_dynamo != "openxla":
       return False
 
     return True
@@ -131,15 +130,14 @@ def load_experiment(self, experiment_config):
     test = experiment_config["test"]
     batch_size = experiment_config.get("batch_size", self._args.batch_size)
     torch_xla2 = experiment_config["torch_xla2"]
-    xla_take_cuda_model_and_data = experiment_config[
-        "xla_take_cuda_model_and_data"]
+    keep_model_data_on_cuda = experiment_config["keep_model_data_on_cuda"]
     return BenchmarkExperiment(
         accelerator=accelerator,
         xla=xla,
         xla_flags=xla_flags,
         dynamo=dynamo,
         torch_xla2=torch_xla2,
-        xla_take_cuda_model_and_data=xla_take_cuda_model_and_data,
+        keep_model_data_on_cuda=keep_model_data_on_cuda,
         test=test,
         batch_size=batch_size)
 
@@ -147,13 +145,13 @@ def load_experiment(self, experiment_config):
 class BenchmarkExperiment:
 
   def __init__(self, accelerator, xla, xla_flags, dynamo, torch_xla2,
-               xla_take_cuda_model_and_data: bool, test, batch_size):
+               keep_model_data_on_cuda: bool, test, batch_size):
     self.accelerator = accelerator
     self.xla = xla
     self.xla_flags = xla_flags
     self.dynamo = dynamo
     self.torch_xla2 = torch_xla2
-    self.xla_take_cuda_model_and_data = xla_take_cuda_model_and_data
+    self.keep_model_data_on_cuda = keep_model_data_on_cuda
     self.test = test
     self.batch_size = batch_size
     self.accelerator_model = get_accelerator_model(self.accelerator)
@@ -217,7 +215,7 @@ def to_dict(self):
     d["xla_flags"] = self.xla_flags
     d["dynamo"] = self.dynamo
     d["torch_xla2"] = self.torch_xla2
-    d["xla_take_cuda_model_and_data"] = self.xla_take_cuda_model_and_data
+    d["keep_model_data_on_cuda"] = self.keep_model_data_on_cuda
     d["test"] = self.test
     d["batch_size"] = self.batch_size
     return d
diff --git a/benchmarks/benchmark_model.py b/benchmarks/benchmark_model.py
index 3846ab965373..16e003d66937 100644
--- a/benchmarks/benchmark_model.py
+++ b/benchmarks/benchmark_model.py
@@ -122,7 +122,7 @@ def prepare_for_experiment(self, dynamo_compilation_opts):
     else:
       raise NotImplementedError
 
-    keep_model_data_on_cuda = self.benchmark_experiment.xla_take_cuda_model_and_data
+    keep_model_data_on_cuda = self.benchmark_experiment.keep_model_data_on_cuda
     if self.benchmark_experiment.torch_xla2:
       import torch_xla2.export
       import torch_xla2
@@ -158,7 +158,7 @@ def prepare_for_experiment(self, dynamo_compilation_opts):
 
     if keep_model_data_on_cuda:
       assert self.example_inputs[0].device.type.lower(
-      ) == 'cuda', 'When xla_take_cuda_model_and_data is set, the input data should remain on the CUDA device.'
+      ) == 'cuda', 'When keep_model_data_on_cuda is set, the input data should remain on the CUDA device.'
 
   def pick_grad(self):
     if self.benchmark_experiment.test == "eval":
diff --git a/benchmarks/experiment_runner.py b/benchmarks/experiment_runner.py
index 8ffac1ab32fe..66717c21f0e2 100644
--- a/benchmarks/experiment_runner.py
+++ b/benchmarks/experiment_runner.py
@@ -863,9 +863,9 @@ def __str__(self):
       CPU fallbacks.""",
   )
   parser.add_argument(
-      "--xla-take-cuda-model-and-data",
+      "--keep-model-data-on-cuda",
       action="store_true",
-      help="""Whether PyTorch/XLA takes models and data on CUDA.""",
+      help="""Whether to keep the model and data on CUDA and not move them to an XLA device.
+      This is to be used with PyTorch/XLA dynamo. When set, the PyTorch/XLA dynamo bridge moves the model and data to the XLA device.""",
   )
   parser.add_argument(
       "--xla-flags",
diff --git a/benchmarks/torchbench_model.py b/benchmarks/torchbench_model.py
index 2e8a60b436d8..c75a3c37ad06 100644
--- a/benchmarks/torchbench_model.py
+++ b/benchmarks/torchbench_model.py
@@ -255,7 +255,7 @@ def set_up(self):
     if self.benchmark_experiment.xla:
       # First, move the model and the inputs to CPU.
       # This avoids having dupplicated data on CUDA.
-      keep_model_data_on_cuda = self.benchmark_experiment.xla_take_cuda_model_and_data
+      keep_model_data_on_cuda = self.benchmark_experiment.keep_model_data_on_cuda
       if self.is_accelerator_cuda() and not keep_model_data_on_cuda:
         self.module = self.module.to("cpu")
         self.example_inputs = move_to_device(self.example_inputs, "cpu")
diff --git a/test/benchmarks/test_benchmark_experiment.py b/test/benchmarks/test_benchmark_experiment.py
index 3d8e85e0b360..afc3419e8b87 100644
--- a/test/benchmarks/test_benchmark_experiment.py
+++ b/test/benchmarks/test_benchmark_experiment.py
@@ -6,8 +6,8 @@ class BenchmarkExperimentTest(unittest.TestCase):
 
   def test_to_dict(self):
-    be = BenchmarkExperiment("cpu", "PJRT", "some xla_flags", "openxla", None, False,
-                             "train", "123")
+    be = BenchmarkExperiment("cpu", "PJRT", "some xla_flags", "openxla", None,
+                             False, "train", "123")
     actual = be.to_dict()
     self.assertEqual(9, len(actual))
     self.assertEqual("cpu", actual["accelerator"])
@@ -16,7 +16,7 @@ def test_to_dict(self):
     self.assertEqual("some xla_flags", actual["xla_flags"])
     self.assertEqual("openxla", actual["dynamo"])
     self.assertEqual(None, actual["torch_xla2"])
-    self.assertEqual(False, actual["xla_take_cuda_model_and_data"])
+    self.assertEqual(False, actual["keep_model_data_on_cuda"])
     self.assertEqual("train", actual["test"])
     self.assertEqual("123", actual["batch_size"])
diff --git a/test/benchmarks/test_experiment_runner.py b/test/benchmarks/test_experiment_runner.py
index 9459388dee77..d34cd33fc6fc 100644
--- a/test/benchmarks/test_experiment_runner.py
+++ b/test/benchmarks/test_experiment_runner.py
@@ -29,10 +29,10 @@ def test_dummy_dry_run(self):
     expected_in_stderr = [
         "Number of selected experiment configs: 4",
         "Number of selected model configs: 1",
-        "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"eval\", \"xla_take_cuda_model_and_data\": false}",
-        "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"train\", \"xla_take_cuda_model_and_data\": false}",
-        "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"eval\", \"xla_take_cuda_model_and_data\": false}",
-        "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"train\", \"xla_take_cuda_model_and_data\": false}",
+        "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"eval\", \"keep_model_data_on_cuda\": false}",
+        "--model-config={\"model_name\": \"dummy\"} 
--experiment-config={\"accelerator\": \"cpu\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"train\", \"keep_model_data_on_cuda\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"eval\", \"keep_model_data_on_cuda\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"train\", \"keep_model_data_on_cuda\": false}", ] for expected in expected_in_stderr: self.assertIn(expected, child.stderr) @@ -57,10 +57,10 @@ def test_dummy_dry_run_cuda(self): expected_in_stderr = [ "Number of selected experiment configs: 4", "Number of selected model configs: 1", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"eval\", \"xla_take_cuda_model_and_data\": false}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"train\", \"xla_take_cuda_model_and_data\": false}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"eval\", \"xla_take_cuda_model_and_data\": false}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"train\", \"xla_take_cuda_model_and_data\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"eval\", \"keep_model_data_on_cuda\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"train\", \"keep_model_data_on_cuda\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"eval\", \"keep_model_data_on_cuda\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"train\", \"keep_model_data_on_cuda\": false}", ] for expected in expected_in_stderr: self.assertIn(expected, child.stderr) @@ -85,8 +85,8 @@ def test_dummy_dry_run_inductor_cuda(self): expected_in_stderr = [ "Number of selected experiment configs: 2", "Number of selected model configs: 1", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"eval\", \"xla_take_cuda_model_and_data\": false}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"train\", \"xla_take_cuda_model_and_data\": false}", + 
"--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"eval\", \"keep_model_data_on_cuda\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"train\", \"keep_model_data_on_cuda\": false}", ] for expected in expected_in_stderr: self.assertIn(expected, child.stderr) @@ -113,11 +113,11 @@ def test_dummy_openxla_eval_train_cuda(self): expected_in_stderr = [ "Number of selected experiment configs: 5", "Number of selected model configs: 1", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla_eval\", \"torch_xla2\": null, \"test\": \"eval\", \"xla_take_cuda_model_and_data\": false}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"train\", \"xla_take_cuda_model_and_data\": false}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"eval\", \"xla_take_cuda_model_and_data\": false}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"eval\", \"xla_take_cuda_model_and_data\": false}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"train\", \"xla_take_cuda_model_and_data\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla_eval\", \"torch_xla2\": null, \"test\": \"eval\", \"keep_model_data_on_cuda\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"train\", \"keep_model_data_on_cuda\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"eval\", \"keep_model_data_on_cuda\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"eval\", \"keep_model_data_on_cuda\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"train\", \"keep_model_data_on_cuda\": false}", ] for expected in expected_in_stderr: self.assertIn(expected, child.stderr) @@ -139,21 +139,21 @@ def test_dummy_dynamo_none_cuda(self): expected_in_stderr = [ "Number of selected experiment configs: 9", "Number of selected model configs: 1", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": null, \"torch_xla2\": null, 
\"test\": \"eval\", \"xla_take_cuda_model_and_data\": false}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": null, \"torch_xla2\": null, \"test\": \"train\", \"xla_take_cuda_model_and_data\": false}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla_eval\", \"torch_xla2\": null, \"test\": \"eval\", \"xla_take_cuda_model_and_data\": false}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"eval\", \"xla_take_cuda_model_and_data\": false}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"train\", \"xla_take_cuda_model_and_data\": false}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": null, \"torch_xla2\": null, \"test\": \"eval\", \"xla_take_cuda_model_and_data\": false}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": null, \"torch_xla2\": null, \"test\": \"train\", \"xla_take_cuda_model_and_data\": false}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"eval\", \"xla_take_cuda_model_and_data\": false}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"train\", \"xla_take_cuda_model_and_data\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": null, \"torch_xla2\": null, \"test\": \"eval\", \"keep_model_data_on_cuda\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": null, \"torch_xla2\": null, \"test\": \"train\", \"keep_model_data_on_cuda\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla_eval\", \"torch_xla2\": null, \"test\": \"eval\", \"keep_model_data_on_cuda\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"eval\", \"keep_model_data_on_cuda\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"train\", \"keep_model_data_on_cuda\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": null, \"torch_xla2\": null, \"test\": \"eval\", \"keep_model_data_on_cuda\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": null, 
\"torch_xla2\": null, \"test\": \"train\", \"keep_model_data_on_cuda\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"eval\", \"keep_model_data_on_cuda\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"train\", \"keep_model_data_on_cuda\": false}", ] for expected in expected_in_stderr: self.assertIn(expected, child.stderr) @absltest.skipUnless(xr.device_type() in {'CUDA'}, 'Needs CUDA accelerator') - def test_dummy_dry_run_cuda_with_xla_take_cuda_model_and_data(self): + def test_dummy_dry_run_cuda_with_keep_model_data_on_cuda(self): child = subprocess.run([ "python", EXPERIMENT_RUNNER_PY, @@ -171,8 +171,8 @@ def test_dummy_dry_run_cuda_with_xla_take_cuda_model_and_data(self): expected_in_stderr = [ "Number of selected experiment configs: 2", "Number of selected model configs: 1", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"eval\", \"xla_take_cuda_model_and_data\": true}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"train\", \"xla_take_cuda_model_and_data\": true}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"eval\", \"keep_model_data_on_cuda\": true}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"train\", \"keep_model_data_on_cuda\": true}", ] for expected in expected_in_stderr: self.assertIn(expected, child.stderr) From ffc9c6d6e174a287ff4428354e5c06991e6a8739 Mon Sep 17 00:00:00 2001 From: iefgnoix Date: Wed, 5 Jun 2024 22:01:41 +0000 Subject: [PATCH 7/8] fix a test --- test/benchmarks/test_experiment_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/benchmarks/test_experiment_runner.py b/test/benchmarks/test_experiment_runner.py index d34cd33fc6fc..0f107afbbfdc 100644 --- a/test/benchmarks/test_experiment_runner.py +++ b/test/benchmarks/test_experiment_runner.py @@ -163,7 +163,7 @@ def test_dummy_dry_run_cuda_with_keep_model_data_on_cuda(self): "--test=train", "--suite-name=dummy", "--accelerator=cuda", - "--xla-take-cuda-model-and-data", + "--keep-model-data-on-cuda", "--dry-run", ], capture_output=True, From 3f347df20ed1c16bfc000106119f3c23526e3be4 Mon Sep 17 00:00:00 2001 From: iefgnoix Date: Wed, 5 Jun 2024 23:36:10 +0000 Subject: [PATCH 8/8] fix the result_analyzer --- benchmarks/README.md | 6 ++-- benchmarks/result_analyzer.py | 52 +++++++++++++++++++++++++---------- 2 files changed, 40 insertions(+), 18 deletions(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index a407ae2cef13..71d6b63d139a 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -46,7 +46,7 @@ parent of the `xla` directory. The following example runs the alexnet benchmark on GPU through the Pytorch/XLA-dynamo path and through the Inductor-dynamo with 5 repetitions each. 
-The results will be stored in a json file in `experiment_results`.
+The results will be stored in a JSON file (e.g. results.jsonl) in `experiment_results`.
 
 ```
 cd pytorch
@@ -132,8 +132,8 @@ PT/XLA, and compare it against some basline.
 
 Run the `result_analyzer.py` from the `pytorch` directory, which should be the
 parent of the `xla` directory.
 
-The following example analyzes the results generated by the above invocation of
-`experiment_runner.py`. The aggregates are saved in CSV format in
+The following example analyzes the results (e.g. results.jsonl) generated by the above invocation of
+`experiment_runner.py`, so make sure to use a consistent `--output-dirname` parameter. The aggregates are saved in CSV format in
 `experiment_results/metric_report.csv`.
 
diff --git a/benchmarks/result_analyzer.py b/benchmarks/result_analyzer.py
index 5b0cf7f4cd3e..fa861510e805 100644
--- a/benchmarks/result_analyzer.py
+++ b/benchmarks/result_analyzer.py
@@ -56,6 +56,7 @@ def run_csv(self):
         "xla_flags": pd.Series(dtype="str"),
         "dynamo": pd.Series(dtype="str"),
         "torch_xla2": pd.Series(dtype="str"),
+        "keep_model_data_on_cuda": pd.Series(dtype="bool"),
         "test": pd.Series(dtype="str"),
         "batch_size": pd.Series(dtype="int"),
         "repeat": pd.Series(dtype="int"),
@@ -119,6 +120,9 @@ def extract_metrics_jsonl(self, file):
       dynamo_value = "None" if dynamo is None else dynamo
       torch_xla2 = dataline["experiment"]["torch_xla2"]
       torch_xla2_value = "None" if torch_xla2 is None else torch_xla2
+      keep_model_data_on_cuda = dataline["experiment"][
+          "keep_model_data_on_cuda"]
+      keep_model_data_on_cuda_value = "None" if keep_model_data_on_cuda is None else keep_model_data_on_cuda
       test = dataline["experiment"]["test"]
       test_value = "None" if test is None else test
       outputs_file = dataline["experiment"].get("outputs_file", None)
@@ -139,6 +143,7 @@ def extract_metrics_jsonl(self, file):
           "xla": xla_value,
           "dynamo": dynamo_value,
           "torch_xla2": torch_xla2_value,
+          "keep_model_data_on_cuda": keep_model_data_on_cuda_value,
           "test": test_value,
           "outputs_file": outputs_file_value
       }
@@ -171,21 +176,38 @@ def extract_metrics_csv(self, file, metric_df):
       timestamp = dataline[
           "timestamp"] if "timestamp" in dataline else self.timestamp
       d = {
-          "timestamp": timestamp,
-          "suite_name": dataline["model"]["suite_name"],
-          "model_name": dataline["model"]["model_name"],
-          "accelerator": dataline["experiment"]["accelerator"],
-          "accelerator_model": dataline["experiment"]["accelerator_model"],
-          "xla": dataline["experiment"]["xla"],
-          "xla_flags": dataline["experiment"]["xla_flags"],
-          "dynamo": dataline["experiment"]["dynamo"],
-          "torch_xla2": dataline["experiment"]["torch_xla2"],
-          "test": dataline["experiment"]["test"],
-          "batch_size": dataline["experiment"]["batch_size"],
-          "repeat": dataline["repeat"],
-          "iterations_per_run": dataline["iterations_per_run"],
-          "error_message": None,
-          "outputs_file": dataline["experiment"].get("outputs_file", ""),
+          "timestamp":
+              timestamp,
+          "suite_name":
+              dataline["model"]["suite_name"],
+          "model_name":
+              dataline["model"]["model_name"],
+          "accelerator":
+              dataline["experiment"]["accelerator"],
+          "accelerator_model":
+              dataline["experiment"]["accelerator_model"],
+          "xla":
+              dataline["experiment"]["xla"],
+          "xla_flags":
+              dataline["experiment"]["xla_flags"],
+          "dynamo":
+              dataline["experiment"]["dynamo"],
+          "torch_xla2":
+              dataline["experiment"]["torch_xla2"],
+          "keep_model_data_on_cuda":
+              dataline["experiment"]["keep_model_data_on_cuda"],
+          "test":
+              dataline["experiment"]["test"],
+          "batch_size":
+
dataline["experiment"]["batch_size"], + "repeat": + dataline["repeat"], + "iterations_per_run": + dataline["iterations_per_run"], + "error_message": + None, + "outputs_file": + dataline["experiment"].get("outputs_file", ""), } if "error" in dataline["metrics"] and not self._args.hide_errors:
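
Taken together, the series adds one knob to the benchmark harness: `keep_model_data_on_cuda` (CLI flag `--keep-model-data-on-cuda`), which is only accepted on the dynamo `openxla` path and, when set, leaves the model and inputs on CUDA so that the PyTorch/XLA dynamo bridge moves them to the XLA device itself. The snippet below is a minimal, self-contained sketch of those two rules, assuming the final (patch 6) config key names; `is_available` and `placement` are illustrative helper names, not functions from the benchmark suite, and the authoritative logic lives in `benchmarks/benchmark_experiment.py` and `benchmarks/benchmark_model.py`.

```
import json

# Experiment-config keys after the rename in patch 6, as accepted by
# experiment_runner.py via --experiment-config.
EXPERIMENT_KEYS = {
    "accelerator", "xla", "xla_flags", "dynamo", "torch_xla2", "test",
    "keep_model_data_on_cuda"
}


def is_available(cfg):
  # Mirrors the gate in BenchmarkExperimentLoader._is_available:
  # keep_model_data_on_cuda is only supported together with dynamo=openxla.
  if cfg.get("keep_model_data_on_cuda") and cfg.get("dynamo") != "openxla":
    return False
  return True


def placement(cfg):
  # Mirrors BenchmarkModel.prepare_for_experiment: with the flag set, the
  # model and inputs stay on CUDA and the dynamo bridge moves them to the
  # XLA device later; otherwise the harness moves them to the XLA device.
  return "cuda" if cfg.get("keep_model_data_on_cuda") else "xla"


cfg = json.loads('{"accelerator": "cuda", "xla": "PJRT", "xla_flags": null,'
                 ' "dynamo": "openxla", "torch_xla2": null, "test": "train",'
                 ' "keep_model_data_on_cuda": true}')
assert set(cfg) == EXPERIMENT_KEYS
assert is_available(cfg)
print(placement(cfg))  # -> cuda
```

Flipping `dynamo` to `inductor` in the same config makes `is_available` return `False`, which matches the dry-run tests above: no `keep_model_data_on_cuda: true` configuration is ever emitted for a non-openxla backend.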