diff --git a/.gitignore b/.gitignore index 2ba9c1e6..900e2ae8 100644 --- a/.gitignore +++ b/.gitignore @@ -14,6 +14,8 @@ *.weight *.nsys-rep *.pkl +*.pt +*.pt2 *.xlsx *.sarif *.sqlitest diff --git a/CHANGELOGS.rst b/CHANGELOGS.rst index 73d274b6..96ba2c91 100644 --- a/CHANGELOGS.rst +++ b/CHANGELOGS.rst @@ -1,6 +1,11 @@ Change Logs =========== +0.8.3 ++++++ + +* :pr:`304`: improves side-by-side comparison + 0.8.2 +++++ diff --git a/_doc/index.rst b/_doc/index.rst index b1f7a98b..c41e655a 100644 --- a/_doc/index.rst +++ b/_doc/index.rst @@ -239,8 +239,8 @@ The function replaces dynamic dimensions defined as strings by Older versions ============== +* `0.8.3 <../v0.8.3/index.html>`_ * `0.8.2 <../v0.8.2/index.html>`_ -* `0.8.1 <../v0.8.1/index.html>`_ * `0.7.16 <../v0.7.16/index.html>`_ * `0.6.3 <../v0.6.3/index.html>`_ * `0.5.0 <../v0.5.0/index.html>`_ diff --git a/_unittests/ut_export/test_api.py b/_unittests/ut_export/test_api.py index 76078acd..c1fa8b09 100644 --- a/_unittests/ut_export/test_api.py +++ b/_unittests/ut_export/test_api.py @@ -110,6 +110,8 @@ def test_tiny_llm_to_onnx(self): diff = max_diff(expected, got) assert diff["abs"] <= 1e-5, f"diff={diff}" + self.clean_dump() + if __name__ == "__main__": unittest.main(verbosity=2) diff --git a/_unittests/ut_helpers/test_bench_run.py b/_unittests/ut_helpers/test_bench_run.py index 12dbae6b..5e831826 100644 --- a/_unittests/ut_helpers/test_bench_run.py +++ b/_unittests/ut_helpers/test_bench_run.py @@ -109,35 +109,51 @@ def test_make_configs_replace(self): def test_max_diff(self): self.assertEqual( max_diff(torch.Tensor([1, 2]), torch.Tensor([1, 2])), - {"abs": 0.0, "rel": 0.0, "sum": 0.0, "n": 2.0, "dnan": 0.0, "argm": (0,)}, + { + "abs": 0.0, + "rel": 0.0, + "sum": 0.0, + "n": 2.0, + "dnan": 0.0, + "argm": (0,), + "dev": 0, + }, ) self.assertEqual( max_diff( (torch.Tensor([1, 2]),), (torch.Tensor([1, 2])), ), - {"abs": 0.0, "rel": 0.0, "sum": 0.0, "n": 2.0, "dnan": 0.0, "argm": (0,)}, + { + "abs": 0.0, + "rel": 0.0, + "sum": 0.0, + "n": 2.0, + "dnan": 0.0, + "argm": (0,), + "dev": 0, + }, ) self.assertEqual( max_diff( (torch.Tensor([1, 2]), (torch.Tensor([1, 2]),)), (torch.Tensor([1, 2]), (torch.Tensor([1, 2]),)), ), - {"abs": 0.0, "rel": 0.0, "sum": 0.0, "n": 4.0, "dnan": 0.0}, + {"abs": 0.0, "rel": 0.0, "sum": 0.0, "n": 4.0, "dnan": 0.0, "dev": 0}, ) self.assertEqual( max_diff( {"a": torch.Tensor([1, 2])}, {"a": torch.Tensor([1, 2])}, ), - {"abs": 0.0, "rel": 0.0, "sum": 0.0, "n": 2.0, "dnan": 0.0}, + {"abs": 0.0, "rel": 0.0, "sum": 0.0, "n": 2.0, "dnan": 0.0, "dev": 0}, ) self.assertEqual( max_diff( {"a": torch.Tensor([1, 2])}, [torch.Tensor([1, 2])], ), - {"abs": 0.0, "rel": 0.0, "sum": 0.0, "n": 2.0, "dnan": 0.0}, + {"abs": 0.0, "rel": 0.0, "sum": 0.0, "n": 2.0, "dnan": 0.0, "dev": 0}, ) self.assertEqual( max_diff( @@ -150,6 +166,7 @@ def test_max_diff(self): "n": 2.0, "rel": 0.9999999997999001, "sum": 9999999998.0, + "dev": 0, }, ) @@ -164,7 +181,9 @@ def test_max_diff_dynamic_cache(self): flatten=True, verbose=10, ) - self.assertEqual(md, {"abs": 0.0, "rel": 0.0, "sum": 0.0, "n": 10.0, "dnan": 0}) + self.assertEqual( + md, {"abs": 0.0, "rel": 0.0, "sum": 0.0, "n": 10.0, "dnan": 0, "dev": 0} + ) if __name__ == "__main__": diff --git a/_unittests/ut_helpers/test_helper.py b/_unittests/ut_helpers/test_helper.py index c9890e98..29a279d7 100644 --- a/_unittests/ut_helpers/test_helper.py +++ b/_unittests/ut_helpers/test_helper.py @@ -11,6 +11,7 @@ hide_stdout, requires_onnx, requires_transformers, + requires_cuda, ) from 
onnx_diagnostic.helpers.helper import ( string_type, @@ -199,6 +200,31 @@ def test_flatten(self): d = string_diff(diff) self.assertIsInstance(d, str) + @hide_stdout() + def test_maxdiff_device(self): + inputs = (torch.arange(2), torch.cos(torch.arange(3))) + diff = max_diff(inputs, inputs, flatten=True, verbose=10) + self.assertEqual(diff["abs"], 0) + self.assertEqual(diff["dev"], 0) + + @hide_stdout() + @requires_cuda() + def test_maxdiff_device_cuda(self): + diff = max_diff(torch.ones((2,)).cuda(), torch.ones((2,)), verbose=10) + self.assertEqual(diff["dev"], 1) + inputs = (torch.arange(2), torch.cos(torch.arange(3))) + inputs2 = (inputs[0].cuda(), inputs[1].cuda()) + diff = max_diff(inputs, inputs2, verbose=10) + self.assertEqual(diff["abs"], 0) + self.assertEqual(diff["dev"], 2) + inputs2 = (inputs[0], inputs[1].cuda()) + diff = max_diff(inputs, inputs2, verbose=10) + self.assertEqual(diff["abs"], 0) + self.assertEqual(diff["dev"], 1) + diff = max_diff(inputs2, inputs2, verbose=10) + self.assertEqual(diff["abs"], 0) + self.assertEqual(diff["dev"], 0) + def test_flatten_cache(self): cache = make_dynamic_cache([(torch.ones((5, 6, 5, 6)), torch.ones((5, 6, 5, 6)) + 2)]) flat = flatten_object(cache, drop_keys=True) diff --git a/_unittests/ut_helpers/test_log_helper.py b/_unittests/ut_helpers/test_log_helper.py index 96fc88d7..df8af469 100644 --- a/_unittests/ut_helpers/test_log_helper.py +++ b/_unittests/ut_helpers/test_log_helper.py @@ -189,6 +189,7 @@ def test_cube_logs_excel(self): verbose=1, ) self.assertExists(output) + self.clean_dump() @hide_stdout() def test_enumerate_csv_files(self): @@ -210,6 +211,7 @@ def test_enumerate_csv_files(self): cube.load(verbose=1) self.assertEqual((3, 11), cube.shape) self.assertIn("RAWFILENAME", cube.data.columns) + self.clean_dump() def test_cube_logs_performance1(self): output = self.get_dump_file("test_cube_logs_performance1.xlsx") @@ -235,6 +237,7 @@ def test_cube_logs_performance1(self): ], ) self.assertExists(output) + self.clean_dump() def test_cube_logs_performance2(self): output = self.get_dump_file("test_cube_logs_performance2.xlsx") @@ -470,6 +473,7 @@ def test_historical_cube_time_mask(self): ) cube = CubeLogs(df, keys=["^m_*", "exporter"], time="date").load() cube.to_excel(output, views=["time_p"], time_mask=True, verbose=1) + self.clean_dump() def test_cube_sbs_no_time(self): df = pandas.DataFrame( @@ -532,6 +536,7 @@ def test_cube_sbs_no_time(self): verbose=0, sbs=dict(CFA=dict(exporter="E1", opt="O"), CFB=dict(exporter="E2", opt="O")), ) + self.clean_dump() def test_cube_sbs_with_time(self): df = pandas.DataFrame( diff --git a/_unittests/ut_helpers/test_ort_session.py b/_unittests/ut_helpers/test_ort_session.py index 0ec5af44..87d5f3df 100644 --- a/_unittests/ut_helpers/test_ort_session.py +++ b/_unittests/ut_helpers/test_ort_session.py @@ -310,6 +310,7 @@ def test_profiling(self): got = wrap.run(None, feeds) self.assertIsInstance(got[0], torch.Tensor) self.assertEqualArray(expected[0], got[0]) + self.clean_dump() if __name__ == "__main__": diff --git a/_unittests/ut_helpers/test_torch_helper.py b/_unittests/ut_helpers/test_torch_helper.py index bc0a29b8..9441e425 100644 --- a/_unittests/ut_helpers/test_torch_helper.py +++ b/_unittests/ut_helpers/test_torch_helper.py @@ -151,6 +151,7 @@ def forward(self, x, y): self.assertEqualAny(restored["main", 1, "I"], (inputs, {})) self.assertEqualAny(restored["main", 0, "O"], res1) self.assertEqualAny(restored["main", 0, "O"], res2) + self.clean_dump() @hide_stdout() def 
test_steal_forward_dump_file_steal_append(self): @@ -181,6 +182,7 @@ def forward(self, x, y): {("", 1, "I"), ("", 1, "O"), "sx", ("", 0, "O"), "sx_1", ("", 0, "I")}, set(restored), ) + self.clean_dump() @hide_stdout() def test_steal_forward_dump_file_steal_append_drop(self): @@ -214,6 +216,7 @@ def forward(self, x, y): first = restored[("", 0, "I")] _a, kws = first self.assertNotIn("x", kws) + self.clean_dump() @hide_stdout() def test_steal_forward_submodules(self): @@ -257,6 +260,7 @@ def forward(self, x, y): ), len(sorted(restored)), ) + self.clean_dump() def test_replace_string_by_dynamic(self): example = { diff --git a/_unittests/ut_reference/test_onnxruntime_evaluator.py b/_unittests/ut_reference/test_onnxruntime_evaluator.py index 738b0ea3..8454a9c4 100644 --- a/_unittests/ut_reference/test_onnxruntime_evaluator.py +++ b/_unittests/ut_reference/test_onnxruntime_evaluator.py @@ -1,10 +1,11 @@ import unittest +from typing import Optional import numpy as np import onnx import onnx.helper as oh import torch import onnxruntime -from onnx_diagnostic.ext_test_case import ExtTestCase, hide_stdout +from onnx_diagnostic.ext_test_case import ExtTestCase, hide_stdout, ignore_warnings from onnx_diagnostic.helpers.onnx_helper import from_array_extended from onnx_diagnostic.reference import ( OnnxruntimeEvaluator, @@ -22,6 +23,14 @@ class TestOnnxruntimeEvaluator(ExtTestCase): + def _range(self, *shape, bias: Optional[float] = None): + n = np.prod(shape) + x = np.arange(n).astype(np.float32) / n + if bias: + x = x + bias + return x.reshape(tuple(shape)).astype(np.float32) + + @ignore_warnings(FutureWarning) def test_ort_eval_scan_cdist_add(self): def dist(unused: torch.Tensor, x: torch.Tensor, samex: torch.Tensor): @@ -69,6 +78,7 @@ def forward(self, x): got = orte.run(None, {name: x.numpy()})[0] self.assertEqualArray(expected, got) + @ignore_warnings((UserWarning, FutureWarning)) def test_ort_eval_cond(self): import torch @@ -180,6 +190,7 @@ def test_constant_bool_input(self): self.assertEqual(got.dtype, torch.bool) self.assertEqual(got[0], True) + @hide_stdout() def test_ort_eval_loop(self): model = torch.nn.EmbeddingBag(num_embeddings=49157, embedding_dim=32, mode="sum") a = torch.tensor([[39906, 39906]]).long() @@ -226,6 +237,28 @@ def test_report_results_comparison_ort(self): self.assertLess(d[(0, "nx"), "r_cos"], 1e-6) self.assertLess(d[(2, "u"), "r_exp"], 1e-6) + @hide_stdout() + def test_skip_layer_normalization(self): + node = oh.make_node( + "SkipLayerNormalization", + ["x", "skip", "beta", "gamma", "bias"], + ["Z"], + epsilon=1.0e-5, + domain="com.microsoft", + ) + feeds = dict( + x=self._range(2, 3, 8), + skip=self._range(2, 3, 8, bias=3), + beta=self._range(8, bias=1), + gamma=self._range(8, bias=2), + bias=self._range(8, bias=0.1), + ) + ref = ExtendedReferenceEvaluator(node) + expected = ref.run(None, feeds) + rt = OnnxruntimeEvaluator(node, verbose=10, opsets={"": 22}) + got = rt.run(None, feeds) + self.assertEqualAny(expected, got, atol=1e-4) + if __name__ == "__main__": unittest.main(verbosity=2) diff --git a/_unittests/ut_tasks/try_export.py b/_unittests/ut_tasks/try_export.py index 75f7e173..0397c310 100644 --- a/_unittests/ut_tasks/try_export.py +++ b/_unittests/ut_tasks/try_export.py @@ -86,6 +86,8 @@ def _config_reduction(config, task): hidden_states=torch.rand((1292, 1176), dtype=torch_dtype).to(device), grid_thw=torch.tensor([[1, 34, 38]], dtype=torch.int64).to(device), ) + print("-- save inputs") + torch.save(inputs, 
self.get_dump_file("qwen_2_5_vl_instruct_visual.inputs.pt")) print(f"-- inputs: {self.string_type(inputs, with_shape=True)}") # this is too long @@ -120,7 +122,7 @@ def _config_reduction(config, task): filename=filename, exporter=exporter, verbose=1, - save_ep=fileep, + save_ep=(fileep, 2**35), target_opset=22, optimize=True, ) diff --git a/_unittests/ut_torch_export_patches/test_patch_torch.py b/_unittests/ut_torch_export_patches/test_patch_torch.py index 74afca7d..1bcdd337 100644 --- a/_unittests/ut_torch_export_patches/test_patch_torch.py +++ b/_unittests/ut_torch_export_patches/test_patch_torch.py @@ -510,7 +510,7 @@ def _batch1(t): got = ep.module()(**torch_deepcopy(inputs)) self.assertEqualArrayAny(expected, got) - @requires_torch("2.9", "Eq(s3, Max(s10, s3)) is inconsistent!") + @requires_torch("2.11", "Eq(s3, Max(s10, s3)) is inconsistent!, until we know more") def test_patch_tiny_llm_dim_meta_level_1(self): class Model(torch.nn.Module): def forward(self, x, ind1, ind2): diff --git a/_unittests/ut_torch_export_patches/test_patch_transformers.py b/_unittests/ut_torch_export_patches/test_patch_transformers.py index 5bc3d0ba..bf96ffcd 100644 --- a/_unittests/ut_torch_export_patches/test_patch_transformers.py +++ b/_unittests/ut_torch_export_patches/test_patch_transformers.py @@ -464,6 +464,7 @@ def forward( atol=1e-3, rtol=1, ) + self.clean_dump() @requires_transformers("4.99") @requires_torch("2.9.99") @@ -508,6 +509,7 @@ def test_qwen2_5_vl_vision_attention_iteration(self): atol=1e-3, rtol=1, ) + self.clean_dump() if __name__ == "__main__": diff --git a/_unittests/ut_torch_models/test_hghub_mode_rewrite.py b/_unittests/ut_torch_models/test_hghub_mode_rewrite.py index 1dcdca82..24907bb0 100644 --- a/_unittests/ut_torch_models/test_hghub_mode_rewrite.py +++ b/_unittests/ut_torch_models/test_hghub_mode_rewrite.py @@ -25,8 +25,8 @@ def test_export_rewriting_bart(self): data = get_untrained_model_with_inputs(mid, verbose=1) model, inputs, ds = data["model"], data["inputs"], data["dynamic_shapes"] dump_folder = self.get_dump_file("test_export_rewritin_bart") - print(self.string_type(inputs)) - print(self.string_type(ds)) + print("--", self.string_type(inputs)) + print("--", self.string_type(ds)) with torch_export_patches( patch_transformers=True, rewrite=model, dump_rewriting=dump_folder ): diff --git a/_unittests/ut_torch_models/test_validate_models.py b/_unittests/ut_torch_models/test_validate_models.py index f2caa5e0..7f0138ee 100644 --- a/_unittests/ut_torch_models/test_validate_models.py +++ b/_unittests/ut_torch_models/test_validate_models.py @@ -43,10 +43,11 @@ def test_validate_tiny_llms_bfloat16(self): ) self.assertLess(summary["disc_onnx_ort_run_abs"], 2e-2) self.assertIn("onnx_filename", data) + self.clean_dump() @unittest.skipIf(torch29_and_tr_main, "combination not working") - @requires_transformers("4.53") - @requires_torch("2.8.99") + @requires_transformers("4.57") # 4.53 works for some jobs fails due to no space left + @requires_torch("2.9.99") # 2.9 works for some jobs fails due to no space left @requires_experimental() @hide_stdout() def test_validate_microsoft_phi4_reasoning(self): @@ -65,6 +66,7 @@ def test_validate_microsoft_phi4_reasoning(self): ) self.assertLess(summary["disc_onnx_ort_run_abs"], 2e-5) self.assertIn("onnx_filename", data) + self.clean_dump() @unittest.skipIf(torch29_and_tr_main, "combination not working") @requires_transformers("4.53") @@ -87,6 +89,7 @@ def test_validate_microsoft_phi3_mini_128k(self): ) 
self.assertLess(summary["disc_onnx_ort_run_abs"], 2e-5) self.assertIn("onnx_filename", data) + self.clean_dump() if __name__ == "__main__": diff --git a/_unittests/ut_torch_models/test_validate_whole_models1.py b/_unittests/ut_torch_models/test_validate_whole_models1.py index dafa4297..d1924e08 100644 --- a/_unittests/ut_torch_models/test_validate_whole_models1.py +++ b/_unittests/ut_torch_models/test_validate_whole_models1.py @@ -50,6 +50,7 @@ def test_c_validate_model(self): self.assertIsInstance(summary, dict) self.assertIsInstance(data, dict) validate_model(mid, do_run=True, verbose=2, quiet=True) + self.clean_dump() @hide_stdout() def test_d_validate_model_dtype(self): @@ -60,6 +61,7 @@ def test_d_validate_model_dtype(self): self.assertIsInstance(summary, dict) self.assertIsInstance(data, dict) validate_model(mid, do_run=True, verbose=2, quiet=True) + self.clean_dump() @hide_stdout() def test_e_validate_model_export(self): @@ -74,6 +76,7 @@ def test_e_validate_model_export(self): ) self.assertIsInstance(summary, dict) self.assertIsInstance(data, dict) + self.clean_dump() @requires_torch("2.10.99") @requires_transformers("4.54") @@ -99,6 +102,7 @@ def test_f_validate_model_onnx_dynamo_ir(self): run_ort_fusion( onnx_filename, output_path, num_attention_heads=2, hidden_size=192, verbose=10 ) + self.clean_dump() @requires_torch("2.7") @requires_onnxscript("0.7") @@ -122,6 +126,7 @@ def test_g_validate_model_onnx_dynamo_os_ort(self): self.assertLess(summary["disc_onnx_ort_run2_batch1_abs"], 1e-4) onnx_filename = data["onnx_filename"] self.assertExists(onnx_filename) + self.clean_dump() @requires_torch("2.7") @hide_stdout() @@ -152,6 +157,7 @@ def test_i_validate_model_custom(self): run_ort_fusion( onnx_filename, output_path, num_attention_heads=2, hidden_size=192, verbose=10 ) + self.clean_dump() @requires_torch("2.7") @hide_stdout() @@ -176,6 +182,7 @@ def test_j_validate_model_custom_torch(self): self.assertIsInstance(data, dict) self.assertIn("disc_onnx_ort_run_abs", summary) self.assertLess(summary["disc_onnx_ort_run_abs"], 1e-4) + self.clean_dump() def test_k_filter_inputs(self): inputs, ds = {"a": 1, "b": 2}, {"a": 20, "b": 30} @@ -222,6 +229,7 @@ def test_n_validate_phi35_mini_instruct(self): onx = onnx.load(onnx_filename) op_types = set(n.op_type for n in onx.graph.node) self.assertIn("If", op_types) + self.clean_dump() if __name__ == "__main__": diff --git a/_unittests/ut_torch_models/test_validate_whole_models2.py b/_unittests/ut_torch_models/test_validate_whole_models2.py index 3f0ad51a..bbedacd7 100644 --- a/_unittests/ut_torch_models/test_validate_whole_models2.py +++ b/_unittests/ut_torch_models/test_validate_whole_models2.py @@ -41,6 +41,7 @@ def test_o_validate_phi35_4k_mini_instruct(self): onx = onnx.load(onnx_filename) op_types = set(n.op_type for n in onx.graph.node) self.assertIn("If", op_types) + self.clean_dump() if __name__ == "__main__": diff --git a/_unittests/ut_torch_models/test_validate_whole_models3.py b/_unittests/ut_torch_models/test_validate_whole_models3.py index 106cfadd..419dbe13 100644 --- a/_unittests/ut_torch_models/test_validate_whole_models3.py +++ b/_unittests/ut_torch_models/test_validate_whole_models3.py @@ -34,6 +34,7 @@ def test_l_validate_model_modelbuilder(self): self.assertLess(summary["disc_onnx_ort_run_abs"], 3e-2) onnx_filename = data["onnx_filename"] self.assertExists(onnx_filename) + self.clean_dump() @requires_torch("2.7") @hide_stdout() @@ -59,6 +60,7 @@ def test_m_validate_model_vit_model(self): self.assertEqual("#1[A1s3x2]", 
summary["run_output_inputs2"]) onnx_filename = data["onnx_filename"] self.assertExists(onnx_filename) + self.clean_dump() if __name__ == "__main__": diff --git a/_unittests/ut_torch_onnx/test_sbs.py b/_unittests/ut_torch_onnx/test_sbs.py index 63732455..6ae72e94 100644 --- a/_unittests/ut_torch_onnx/test_sbs.py +++ b/_unittests/ut_torch_onnx/test_sbs.py @@ -1,29 +1,32 @@ import unittest +import pandas +import onnx from onnx_diagnostic.ext_test_case import ( ExtTestCase, hide_stdout, ignore_warnings, ignore_errors, + requires_cuda, ) -from onnx_diagnostic.reference import ExtendedReferenceEvaluator -from onnx_diagnostic.torch_onnx.sbs import run_aligned - -try: - from experimental_experiment.torch_interpreter import to_onnx -except ImportError: - to_onnx = None +from onnx_diagnostic.reference import ExtendedReferenceEvaluator, OnnxruntimeEvaluator +from onnx_diagnostic.torch_export_patches.patch_inputs import use_dyn_not_str +from onnx_diagnostic.torch_onnx.sbs import run_aligned, post_process_run_aligned_obs +from onnx_diagnostic.export.api import to_onnx class TestSideBySide(ExtTestCase): + @classmethod + def setUpClass(cls): + import torch + + cls.torch = torch @hide_stdout() @unittest.skipIf(to_onnx is None, "to_onnx not installed") @ignore_errors(OSError) # connectivity issues @ignore_warnings((UserWarning,)) def test_ep_onnx_sync_exp(self): - import torch - - class Model(torch.nn.Module): + class Model(self.torch.nn.Module): def forward(self, x): ry = x.abs() rz = ry.exp() @@ -31,31 +34,29 @@ def forward(self, x): ru = rw.log() + rw return ru - x = torch.randn((5, 4)) + x = self.torch.randn((5, 4)) Model()(x) - ep = torch.export.export( - Model(), (x,), dynamic_shapes=({0: torch.export.Dim("batch")},) + ep = self.torch.export.export( + Model(), (x,), dynamic_shapes=({0: self.torch.export.Dim("batch")},) ) - onx = to_onnx(ep) + onx = to_onnx(ep, exporter="custom").model_proto results = list( run_aligned( ep, onx, - (x,), - check_conversion_cls=dict( - cls=ExtendedReferenceEvaluator, atol=1e-5, rtol=1e-5 - ), + args=(x,), + run_cls=ExtendedReferenceEvaluator, + atol=1e-5, + rtol=1e-5, verbose=1, ), ) - self.assertEqual(len(results), 5) + self.assertEqual(len(results), 7) @hide_stdout() @ignore_warnings((DeprecationWarning, FutureWarning, UserWarning)) def test_ep_onnx_sync_a(self): - import torch - - class Model(torch.nn.Module): + class Model(self.torch.nn.Module): def forward(self, x): ry = x.abs() rz = ry.exp() @@ -63,25 +64,375 @@ def forward(self, x): ru = rw.log() + rw return ru - x = torch.randn((5, 4)) + x = self.torch.randn((5, 4)) Model()(x) - ep = torch.export.export( - Model(), (x,), dynamic_shapes=({0: torch.export.Dim("batch")},) + ep = self.torch.export.export( + Model(), (x,), dynamic_shapes=({0: self.torch.export.Dim("batch")},) ) - epo = torch.onnx.export(ep, (x,), dynamic_shapes=({0: torch.export.Dim("batch")},)) + onx = to_onnx( + ep, + (x,), + dynamic_shapes=({0: self.torch.export.Dim("batch")},), + exporter="onnx-dynamo", + ).model_proto + results = list( + run_aligned( + ep, + onx, + args=(x,), + run_cls=ExtendedReferenceEvaluator, + atol=1e-5, + rtol=1e-5, + verbose=1, + ), + ) + self.assertEqual(len(results), 6) + + @hide_stdout() + @ignore_warnings((DeprecationWarning, FutureWarning, UserWarning)) + def test_sbs_dict(self): + class Model(self.torch.nn.Module): + def forward(self, x): + ry = x.abs() + rz = ry.exp() + rw = rz + 1 + ru = rw.log() + rw + return ru + + inputs = dict(x=self.torch.randn((5, 4))) + ds = dict(x={0: "batch"}) + Model()(**inputs) + ep = 
self.torch.export.export( + Model(), (), kwargs=inputs, dynamic_shapes=use_dyn_not_str(ds) + ) + epo = to_onnx(Model(), (), kwargs=inputs, dynamic_shapes=ds, exporter="onnx-dynamo") onx = epo.model_proto results = list( run_aligned( ep, onx, - (x,), - check_conversion_cls=dict( - cls=ExtendedReferenceEvaluator, atol=1e-4, rtol=1e-4 - ), + kwargs=inputs, + run_cls=ExtendedReferenceEvaluator, + atol=1e-5, + rtol=1e-5, verbose=1, ), ) - self.assertEqual(len(results), 4) + self.assertEqual(len(results), 6) + + @hide_stdout() + @ignore_warnings((DeprecationWarning, FutureWarning, UserWarning)) + def test_sbs_dict_onnxruntime(self): + class Model(self.torch.nn.Module): + def forward(self, x): + ry = x.abs() + rz = ry.exp() + rw = rz + 1 + ru = rw.log() + rw + return ru + + inputs = dict(x=self.torch.randn((5, 4))) + ds = dict(x={0: "batch"}) + Model()(**inputs) + ep = self.torch.export.export( + Model(), (), kwargs=inputs, dynamic_shapes=use_dyn_not_str(ds) + ) + onx = to_onnx(ep, exporter="custom").model_proto + results = list( + run_aligned( + ep, + onx, + kwargs=inputs, + run_cls=OnnxruntimeEvaluator, + atol=1e-5, + rtol=1e-5, + verbose=11, + ), + ) + self.assertEqual(len(results), 7) + + @hide_stdout() + @ignore_warnings((DeprecationWarning, FutureWarning, UserWarning)) + def test_sbs_dict_tensor(self): + class Model(self.torch.nn.Module): + def forward(self, x): + ry = x.abs() + rz = ry.exp() + rw = rz + 1 + ru = rw.log() + rw + ry + return ru + + inputs = dict(x=self.torch.randn((5, 4))) + ds = dict(x={0: "batch"}) + Model()(**inputs) + ep = self.torch.export.export( + Model(), (), kwargs=inputs, dynamic_shapes=use_dyn_not_str(ds) + ) + onx = to_onnx(ep, exporter="custom").model_proto + results = list( + run_aligned( + ep, + onx, + kwargs=inputs, + run_cls=OnnxruntimeEvaluator, + atol=1e-5, + rtol=1e-5, + verbose=11, + use_tensor=True, + ), + ) + self.assertEqual(len(results), 8) + self.clean_dump() + + @hide_stdout() + @ignore_warnings((DeprecationWarning, FutureWarning, UserWarning)) + @requires_cuda() + def test_sbs_dict_tensor_cuda(self): + class Model(self.torch.nn.Module): + def forward(self, x): + ry = x.abs() + rz = ry.exp() + rw = rz + 1 + ru = rw.log() + rw + ry + return ru + + inputs = dict(x=self.torch.randn((5, 4)).to("cuda")) + ds = dict(x={0: "batch"}) + Model()(**inputs) + ep = self.torch.export.export( + Model(), (), kwargs=inputs, dynamic_shapes=use_dyn_not_str(ds) + ) + onx = to_onnx(ep, exporter="custom").model_proto + results = list( + run_aligned( + ep, + onx, + kwargs=inputs, + run_cls=OnnxruntimeEvaluator, + atol=1e-5, + rtol=1e-5, + verbose=11, + use_tensor=True, + ), + ) + self.assertEqual(len(results), 8) + + @hide_stdout() + @ignore_warnings((DeprecationWarning, FutureWarning, UserWarning)) + @requires_cuda() + def test_sbs_dict_tensor_cuda_reshape(self): + class Model(self.torch.nn.Module): + def forward(self, x): + ry = x.abs() + ry1 = ry.reshape((-1, 1)) + ry2 = ry.reshape((1, -1)) + prod = ry1 * ry2 + shape = prod.shape + resh = prod.reshape((-1, shape[0] // 2, shape[1] // 2)) + return resh.transpose(2, 1) + + inputs = dict(x=self.torch.randn((16, 16)).to("cuda")) + ds = dict(x={0: "batch"}) + Model()(**inputs) + ep = self.torch.export.export( + Model(), (), kwargs=inputs, dynamic_shapes=use_dyn_not_str(ds) + ) + onx = to_onnx(ep, exporter="custom").model_proto + results = list( + run_aligned( + ep, + onx, + kwargs=inputs, + run_cls=OnnxruntimeEvaluator, + atol=1e-5, + rtol=1e-5, + verbose=11, + use_tensor=True, + ), + ) + self.assertEqual(len(results), 14) 
+ + @hide_stdout() + @ignore_warnings((DeprecationWarning, FutureWarning, UserWarning)) + def test_sbs_dict_tensor_cpu_reshape(self): + class Model(self.torch.nn.Module): + def forward(self, x): + ry = x.abs() + ry1 = ry.reshape((-1, 1)) + ry2 = ry.reshape((1, -1)) + prod = ry1 * ry2 + shape = prod.shape + resh = prod.reshape((-1, shape[0] // 2, shape[1] // 2)) + return resh.transpose(2, 1) + + inputs = dict(x=self.torch.randn((16, 16))) + ds = dict(x={0: "batch"}) + Model()(**inputs) + ep = self.torch.export.export( + Model(), (), kwargs=inputs, dynamic_shapes=use_dyn_not_str(ds) + ) + onx = to_onnx(ep, exporter="custom").model_proto + results = list( + run_aligned( + ep, + onx, + kwargs=inputs, + run_cls=OnnxruntimeEvaluator, + atol=1e-5, + rtol=1e-5, + verbose=11, + use_tensor=True, + ), + ) + self.assertEqual(len(results), 14) + self.assertEqual([r[-1].get("dev", 0) for r in results], [0] * 14) + + @hide_stdout() + @ignore_warnings((DeprecationWarning, FutureWarning, UserWarning)) + def test_sbs_model_with_weights_custom(self): + torch = self.torch + + class Model(self.torch.nn.Module): + def __init__(self): + super(Model, self).__init__() + self.fc1 = torch.nn.Linear(10, 32) # input size 10 → hidden size 32 + self.relu = torch.nn.ReLU() + self.fc2 = torch.nn.Linear(32, 1) # hidden → output + + def forward(self, x): + x = self.relu(self.fc1(x)) + x = self.fc2(x) + return x + + inputs = dict(x=self.torch.randn((5, 10))) + ds = dict(x={0: "batch"}) + Model()(**inputs) + ep = self.torch.export.export( + Model(), (), kwargs=inputs, dynamic_shapes=use_dyn_not_str(ds) + ) + filename = self.get_dump_file("test_sbs_model_with_weights_custom.onnx") + to_onnx(ep, exporter="custom", filename=filename) + onx = onnx.load(filename) + results = list( + run_aligned( + ep, + onx, + kwargs=inputs, + run_cls=OnnxruntimeEvaluator, + verbose=11, + use_tensor=True, + ), + ) + df = pandas.DataFrame(list(map(post_process_run_aligned_obs, results))) + df.to_excel(self.get_dump_file("test_sbs_model_with_weights_custom.xlsx")) + self.assertEqual( + [ + "ep_id_node", + "ep_name", + "ep_target", + "err_abs", + "err_dev", + "err_rel", + "onnx_id_node", + "onnx_name", + "onnx_op_type", + "shape_type", + ], + sorted(df.columns), + ) + self.assertEqual(len(results), 12) + self.assertEqual([r[-1].get("dev", 0) for r in results], [0] * 12) + self.assertEqual( + [-1.0, -1.0, -1.0, -1.0, -10.0, -10.0, -10.0, -10.0, -1.0, 0.0, 1.0, 2.0], + df["onnx_id_node"].fillna(-10).tolist(), + ) + self.clean_dump() + + @hide_stdout() + @ignore_warnings((DeprecationWarning, FutureWarning, UserWarning)) + def test_sbs_model_with_weights_dynamo(self): + torch = self.torch + + class Model(self.torch.nn.Module): + def __init__(self): + super(Model, self).__init__() + self.fc1 = torch.nn.Linear(10, 32) # input size 10 → hidden size 32 + self.relu = torch.nn.ReLU() + self.fc2 = torch.nn.Linear(32, 1) # hidden → output + + def forward(self, x): + x = self.relu(self.fc1(x)) + x = self.fc2(x) + return x + + inputs = dict(x=self.torch.randn((5, 10))) + ds = dict(x={0: "batch"}) + Model()(**inputs) + ep = self.torch.export.export( + Model(), (), kwargs=inputs, dynamic_shapes=use_dyn_not_str(ds) + ) + filename = self.get_dump_file("test_sbs_model_with_weights_dynamo.onnx") + to_onnx(ep, exporter="onnx-dynamo", filename=filename) + onx = onnx.load(filename) + results = list( + run_aligned( + ep, + onx, + kwargs=inputs, + run_cls=OnnxruntimeEvaluator, + verbose=11, + use_tensor=True, + ), + ) + df = 
pandas.DataFrame(list(map(post_process_run_aligned_obs, results))) + df.to_excel(self.get_dump_file("test_sbs_model_with_weights_dynamo.xlsx")) + self.assertEqual( + [ + "ep_id_node", + "ep_name", + "ep_target", + "err_abs", + "err_dev", + "err_rel", + "onnx_id_node", + "onnx_name", + "onnx_op_type", + "shape_type", + ], + sorted(df.columns), + ) + self.assertEqual(len(results), 12) + self.assertEqual([r[-1].get("dev", 0) for r in results], [0] * 12) + self.assertEqual( + [-1.0, -1.0, -1.0, -1.0, -10.0, -10.0, -10.0, -10.0, -1.0, 0.0, 1.0, 2.0], + df["onnx_id_node"].fillna(-10).tolist(), + ) + self.clean_dump() + + @hide_stdout() + def test_sbs_unique_consecutive(self): + torch = self.torch + + class Model(torch.nn.Module): + def forward(self, x): + return torch.unique_consecutive(x) + + model = Model() + inputs = (torch.tensor([0, 1, 2, 2, 3, 3, 0, 0], dtype=torch.int64),) + ds = ({0: "length"},) + ep = torch.export.export(model, inputs, dynamic_shapes=use_dyn_not_str(ds)) + onx = to_onnx(model, inputs, dynamic_shapes=ds, exporter="custom").model_proto + results = list( + run_aligned( + ep, + onx, + kwargs=inputs, + run_cls=OnnxruntimeEvaluator, + verbose=11, + use_tensor=True, + ), + ) + self.assertEqual(len(results), 5) if __name__ == "__main__": diff --git a/_unittests/ut_xrun_doc/test_check_ort_float16.py b/_unittests/ut_xrun_doc/test_check_ort_float16.py index 6adacc71..0f7fb120 100644 --- a/_unittests/ut_xrun_doc/test_check_ort_float16.py +++ b/_unittests/ut_xrun_doc/test_check_ort_float16.py @@ -141,6 +141,7 @@ def common_scatter(self, opset, providers, dtype, reduction, expected_names): short_list, tuple([("CUDAExecutionProvider", o) for o in en] for en in expected_names), ) + self.clean_dump() @unittest.skip("https://github.com/sdpython/onnx-diagnostic/issues/240") @requires_cuda() diff --git a/_unittests/ut_xrun_doc/test_command_lines.py b/_unittests/ut_xrun_doc/test_command_lines.py index b478f538..5317190f 100644 --- a/_unittests/ut_xrun_doc/test_command_lines.py +++ b/_unittests/ut_xrun_doc/test_command_lines.py @@ -9,6 +9,7 @@ get_parser_find, get_parser_lighten, get_parser_print, + get_parser_sbs, get_parser_stats, get_parser_unlighten, get_parser_validate, @@ -79,6 +80,13 @@ def test_parser_agg(self): text = st.getvalue() self.assertIn("--recent", text) + def test_parser_sbs(self): + st = StringIO() + with redirect_stdout(st): + get_parser_sbs().print_help() + text = st.getvalue() + self.assertIn("--onnx", text) + if __name__ == "__main__": unittest.main(verbosity=2) diff --git a/onnx_diagnostic/__init__.py b/onnx_diagnostic/__init__.py index 9f99501a..e8c842df 100644 --- a/onnx_diagnostic/__init__.py +++ b/onnx_diagnostic/__init__.py @@ -3,5 +3,5 @@ Functions, classes to dig into a model when this one is right, slow, wrong... 
""" -__version__ = "0.8.2" +__version__ = "0.8.3" __author__ = "Xavier Dupré" diff --git a/onnx_diagnostic/_command_lines_parser.py b/onnx_diagnostic/_command_lines_parser.py index 854943bf..3f61915e 100644 --- a/onnx_diagnostic/_command_lines_parser.py +++ b/onnx_diagnostic/_command_lines_parser.py @@ -4,6 +4,7 @@ import re import sys import textwrap +import time import onnx from typing import Any, Dict, List, Optional, Union from argparse import ArgumentParser, RawTextHelpFormatter, BooleanOptionalAction @@ -1104,6 +1105,153 @@ def _cmd_agg(argv: List[Any]): print(f"Wrote {args.output!r}") +def get_parser_sbs() -> ArgumentParser: + parser = ArgumentParser( + prog="side-by-side (sbs)", + description=textwrap.dedent( + """ + Compares the intermediate outputs between the exported program and + the exported onnx model. It assumes some names are common. + The execution of the exported program and the onnx model + are done in parallel. The device is the one used to store the + model and the inputs.s + """ + ), + epilog="Where do discrepancies start? This function tries to answer that question.", + ) + parser.add_argument( + "-i", + "--inputs", + type=str, + required=True, + help="model inputs saved with torch.save", + ) + parser.add_argument( + "--ep", + type=str, + required=True, + help="exported program saved with torch.export.save", + ) + parser.add_argument( + "-m", + "--onnx", + type=str, + required=True, + help="exported model in onnx format", + ) + parser.add_argument( + "-o", + "--output", + type=str, + required=True, + help="output name to stored what the command line produces, " + "it should be an excel file", + ) + parser.add_argument( + "--atol", + default=1e-5, + required=False, + help="absolute tolerance", + ) + parser.add_argument( + "--rtol", + default=1e-5, + required=False, + help="relative tolerance", + ) + parser.add_argument( + "-v", + "--verbose", + default=0, + required=False, + help="verbosity", + ) + parser.add_argument( + "-r", + "--ratio", + default=5, + required=False, + help="Saves the result in an excel file every node.", + ) + return parser + + +def _cmd_sbs(argv: List[Any]): + import pandas + import torch + from .helpers import string_type + from .torch_onnx.sbs import run_aligned, post_process_run_aligned_obs + from .reference import OnnxruntimeEvaluator + + parser = get_parser_sbs() + args = parser.parse_args(argv[1:]) + + def _size(name): + s = os.stat(name).st_size + return f"{s / 2**20:1.3f} Mb" + + print("-- side by side") + print(f"-- ep: {_size(args.ep)}: {args.ep}") + print(f"-- inputs: {_size(args.inputs)}: {args.inputs}") + print(f"-- onnx: {_size(args.onnx)}: {args.onnx}") + print(f"-- output: {args.output}") + + print(f"-- load inputs {args.inputs!r}") + begin = time.perf_counter() + inputs = torch.load(args.inputs) + s = string_type(inputs, with_shape=True, with_device=True) + print(f"-- done in {time.perf_counter() - begin:1.1f}s - {s}") + + if isinstance(inputs, dict) and len(inputs) == 2 and set(inputs) == {"args", "kwargs"}: + margs = inputs["args"] + mkwargs = inputs["kwargs"] + elif isinstance(inputs, tuple): + margs = inputs + mkwargs = {} + elif isinstance(inputs, dict): + margs = tuple() + mkwargs = inputs + else: + raise ValueError( + f"Unable to infer args, kwargs from inputs {string_type(inputs, with_shape=True)}" + ) + + print(f"-- load ep {args.ep!r}") + begin = time.perf_counter() + ep = torch.export.load(args.ep) + print(f"-- done in {time.perf_counter() - begin:1.1f}s") + + print(f"-- load onnx {args.onnx!r}") + begin = 
time.perf_counter() + onx = onnx.load(args.onnx) + print(f"-- done in {time.perf_counter() - begin:1.1f}s") + + print("-- starts side-by-side") + ratio = int(args.ratio) + data = [] + for obs in run_aligned( + ep, + onx, + run_cls=OnnxruntimeEvaluator, # type: ignore[arg-type] + atol=float(args.atol), + rtol=float(args.rtol), + verbose=int(args.verbose), + args=margs, + kwargs=mkwargs, + use_tensor=True, + exc=False, + ): + pobs = post_process_run_aligned_obs(obs) + data.append(pobs) + if "initializer" not in pobs and "placeholder" not in pobs and len(data) % ratio == 0: + df = pandas.DataFrame(data) + df.to_excel(args.output) + print(f"-- final save into {args.output!r}") + df = pandas.DataFrame(data) + df.to_excel(args.output) + print("-- done") + + def get_main_parser() -> ArgumentParser: parser = ArgumentParser( prog="onnx_diagnostic", @@ -1120,6 +1268,7 @@ def get_main_parser() -> ArgumentParser: find - find node consuming or producing a result lighten - makes an onnx model lighter by removing the weights, print - prints the model on standard output + sbs - compares an exported program and an onnx model stats - produces statistics on a model unlighten - restores an onnx model produces by the previous experiment validate - validate a model @@ -1135,6 +1284,7 @@ def get_main_parser() -> ArgumentParser: "find", "lighten", "print", + "sbs", "stats", "unlighten", "validate", @@ -1146,15 +1296,16 @@ def main(argv: Optional[List[Any]] = None): fcts = dict( + agg=_cmd_agg, + config=_cmd_config, + exportsample=_cmd_export_sample, + find=_cmd_find, lighten=_cmd_lighten, - unlighten=_cmd_unlighten, print=_cmd_print, - find=_cmd_find, - config=_cmd_config, - validate=_cmd_validate, + sbs=_cmd_sbs, stats=_cmd_stats, - agg=_cmd_agg, - exportsample=_cmd_export_sample, + unlighten=_cmd_unlighten, + validate=_cmd_validate, ) if argv is None: @@ -1169,15 +1320,16 @@ def main(argv: Optional[List[Any]] = None): parser.parse_args(argv) else: parsers = dict( + agg=get_parser_agg, + config=get_parser_config, + exportsample=lambda: get_parser_validate("exportsample"), # type: ignore[operator] + find=get_parser_find, lighten=get_parser_lighten, - unlighten=get_parser_unlighten, print=get_parser_print, - find=get_parser_find, - config=get_parser_config, - validate=get_parser_validate, + sbs=get_parser_sbs, stats=get_parser_stats, - agg=get_parser_agg, - exportsample=lambda: get_parser_validate("exportsample"), # type: ignore[operator] + unlighten=get_parser_unlighten, + validate=get_parser_validate, ) cmd = argv[0] if cmd not in parsers: diff --git a/onnx_diagnostic/ext_test_case.py b/onnx_diagnostic/ext_test_case.py index efb8e633..f1cec362 100644 --- a/onnx_diagnostic/ext_test_case.py +++ b/onnx_diagnostic/ext_test_case.py @@ -9,6 +9,7 @@ import logging import os import re +import shutil import sys import unittest import warnings @@ -147,7 +148,7 @@ def hide_stdout(f: Optional[Callable] = None) -> Callable: def wrapper(fct): def call_f(self): - if os.environ.get("UNHIDE", ""): + if os.environ.get("UNHIDE", "") in ("1", "True", "true"): fct(self) return st = StringIO() @@ -806,6 +807,15 @@ def get_dump_folder(self, folder: str) -> str: os.makedirs(folder) return folder + def clean_dump(self, folder: str = "dump_test"): + """Removes every file and subfolder in ``folder``.""" + for item in os.listdir(folder): + item_path = os.path.join(folder, item) + if os.path.isfile(item_path) or os.path.islink(item_path): + os.remove(item_path) + elif os.path.isdir(item_path): + shutil.rmtree(item_path) +
def dump_onnx( self, name: str, diff --git a/onnx_diagnostic/helpers/helper.py b/onnx_diagnostic/helpers/helper.py index 092070b6..665954f5 100644 --- a/onnx_diagnostic/helpers/helper.py +++ b/onnx_diagnostic/helpers/helper.py @@ -529,16 +529,20 @@ def string_type( return "OV(NO-NUMPY:FIXIT)" if verbose: print(f"[string_type] V4:{type(obj)}") - return f"OV({string_type(t, with_shape=with_shape, with_min_max=with_min_max)})" + dev = ("G" if obj.device_name() == "Cuda" else "C") if with_device else "" + return ( + f"{dev}OV({string_type(t, with_shape=with_shape, with_min_max=with_min_max)})" + ) dt = obj.element_type() shape = obj.shape() + dev = ("G" if obj.device_name() == "Cuda" else "C") if with_device else "" if with_shape: if verbose: print(f"[string_type] V5:{type(obj)}") - return f"OV{dt}s{'x'.join(map(str, shape))}" + return f"{dev}OV{dt}s{'x'.join(map(str, shape))}" if verbose: print(f"[string_type] V6:{type(obj)}") - return f"OV{dt}r{len(shape)}" + return f"{dev}OV{dt}r{len(shape)}" # others classes @@ -990,7 +994,7 @@ def max_diff( _index: int = 0, allow_unique_tensor_with_list_of_one_element: bool = True, hist: Optional[Union[bool, List[float]]] = None, -) -> Dict[str, Union[float, int, Tuple[int, ...]]]: +) -> Dict[str, Union[float, int, Tuple[Any, ...]]]: """ Returns the maximum discrepancy. @@ -1015,6 +1019,7 @@ def max_diff( output, this number will be the number of elements of this output * dnan: difference in the number of nan + * dev: 0 if the compared tensors are on the same device, the number of device mismatches otherwise, when applicable You may use :func:`string_diff` to display the discrepancies in one string. """ @@ -1167,7 +1172,7 @@ def max_diff( if verbose >= 6: print(f"[max_diff] list,tuple,6: {string_type(expected)} ? {string_type(got)}") - am, rm, sm, n, dn, drep = 0, 0, 0.0, 0.0, 0, None + am, rm, sm, n, dn, drep, dd = 0, 0, 0.0, 0.0, 0, None, None for ip, (e, g) in enumerate(zip(expected, got)): d = max_diff( e, @@ -1199,7 +1204,15 @@ def max_diff( else: for k, v in d["rep"].items(): drep[k] += v + if "dev" in d and d["dev"] is not None: + if dd is None: + dd = d["dev"] + else: + dd += d["dev"] # type: ignore[operator] + res = dict(abs=am, rel=rm, sum=sm, n=n, dnan=dn) + if dd is not None: + res["dev"] = dd if drep: res["rep"] = drep return res # type: ignore @@ -1233,33 +1246,42 @@ def max_diff( import torch if isinstance(expected, np.ndarray) or isinstance(got, np.ndarray): + dev = None if isinstance(expected, torch.Tensor): from .torch_helper import to_numpy + dev = 0 if expected.device.type == "cpu" else 1 expected = to_numpy(expected) if isinstance(got, torch.Tensor): from .torch_helper import to_numpy + dev = 0 if got.device.type == "cpu" else 1 got = to_numpy(got) if verbose >= 6: print(f"[max_diff] tensor: {string_type(expected)} ?
{string_type(got)}") if _index < begin or (end != -1 and _index >= end): # out of boundary - return dict(abs=0.0, rel=0.0, sum=0.0, n=0.0, dnan=0) + res = dict(abs=0.0, rel=0.0, sum=0.0, n=0.0, dnan=0) + if dev is not None: + res["dev"] = dev # type: ignore[operator] + return res # type: ignore[return-value] if isinstance(expected, (int, float)): if isinstance(got, np.ndarray) and len(got.shape) == 0: got = float(got) if isinstance(got, (int, float)): if expected == got: return dict(abs=0.0, rel=0.0, sum=0.0, n=0.0, dnan=0) - return dict( + res = dict( abs=abs(expected - got), rel=abs(expected - got) / (abs(expected) + 1e-5), sum=abs(expected - got), n=1, dnan=0, ) + if dev is not None: + res["dev"] = dev + return res # type: ignore[return-value] return dict(abs=np.inf, rel=np.inf, sum=np.inf, n=np.inf, dnan=np.inf) if expected.dtype in (np.complex64, np.complex128): if got.dtype == expected.dtype: @@ -1339,6 +1361,8 @@ def max_diff( res: Dict[str, float] = dict( # type: ignore abs=abs_diff, rel=rel_diff, sum=sum_diff, n=n_diff, dnan=nan_diff, argm=argm ) + if dev is not None: + res["dev"] = dev if hist: if isinstance(hist, bool): hist = np.array([0, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100], dtype=diff.dtype) @@ -1352,9 +1376,14 @@ def max_diff( if isinstance(expected, torch.Tensor) and isinstance(got, torch.Tensor): if verbose >= 6: print(f"[max_diff] tensor: {string_type(expected)} ? {string_type(got)}") + dev = 0 if expected.device == got.device else 1 if _index < begin or (end != -1 and _index >= end): # out of boundary - return dict(abs=0.0, rel=0.0, sum=0.0, n=0.0, dnan=0) + if verbose >= 10: + if debug_info: + print("\n".join(debug_info)) + print("[max_diff] out of boundary") + return dict(abs=0.0, rel=0.0, sum=0.0, n=0.0, dnan=0, dev=dev) if expected.dtype in (torch.complex64, torch.complex128): if got.dtype == expected.dtype: got = torch.view_as_real(got) @@ -1448,7 +1477,13 @@ def max_diff( ) res: Dict[str, float] = dict( # type: ignore - abs=abs_diff, rel=rel_diff, sum=sum_diff, n=n_diff, dnan=nan_diff, argm=argm + abs=abs_diff, + rel=rel_diff, + sum=sum_diff, + n=n_diff, + dnan=nan_diff, + argm=argm, + dev=dev, ) if hist: if isinstance(hist, bool): @@ -1466,13 +1501,31 @@ def max_diff( ) return res # type: ignore + if isinstance(expected, int) and isinstance(got, torch.Tensor): + # a size + if verbose >= 6: + print(f"[max_diff] int: {string_type(expected)} ? {string_type(got)}") + if got.shape != tuple(): + return dict( # type: ignore + abs=np.inf, + rel=np.inf, + sum=np.inf, + n=np.inf, + dnan=np.inf, + argm=np.inf, + ) + return dict( # type: ignore + abs=abs(expected - got.item()), + rel=abs((expected - got.item()) / max(1, expected)), + sum=abs(expected - got.item()), + n=1, + dnan=0, + ) + if "SquashedNormal" in expected.__class__.__name__: if verbose >= 6: print(f"[max_diff] SquashedNormal: {string_type(expected)} ? 
{string_type(got)}") - values = ( - expected.mean.detach().to("cpu"), - expected.scale.detach().to("cpu"), - ) + values = (expected.mean, expected.scale) return max_diff(values, got, debug_info=_debug("SquashedNormal"), **_dkws) if expected.__class__ in torch.utils._pytree.SUPPORTED_NODES: @@ -1677,7 +1730,7 @@ def max_diff( raise AssertionError( f"Not implemented with implemented with expected=" - f"{string_type(expected)}, got={string_type(got)},\n" + f"{string_type(expected)} ({type(expected)}), got={string_type(got)},\n" f"level={level}" ) @@ -1685,6 +1738,9 @@ def max_diff( def string_diff(diff: Dict[str, Any]) -> str: """Renders discrepancies return by :func:`max_diff` into one string.""" # dict(abs=, rel=, sum=, n=n_diff, dnan=) + if "dev" in diff: + ddiff = {k: v for k, v in diff.items() if k != "dev"} + return f"{string_diff(ddiff)}, dev={diff['dev']}" suffix = "" if "rep" in diff: rows = [] diff --git a/onnx_diagnostic/helpers/ort_session.py b/onnx_diagnostic/helpers/ort_session.py index 7477c8cd..69f41826 100644 --- a/onnx_diagnostic/helpers/ort_session.py +++ b/onnx_diagnostic/helpers/ort_session.py @@ -338,6 +338,7 @@ class InferenceSessionForTorch(_InferenceSession): :param optimized_model_filepath: see :class:`onnxruntime.SessionOptions` :param disable_aot_function_inlining: see :class:`onnxruntime.SessionOptions` :param use_training_api: use onnxruntime-traning API + :param cpu_output: if True, force the outputs to be on CPU """ def __init__( @@ -353,6 +354,7 @@ def __init__( optimized_model_filepath: Optional[str] = None, disable_aot_function_inlining: Optional[bool] = None, use_training_api: Optional[bool] = None, + cpu_outputs: bool = False, ): super().__init__( sess, @@ -367,6 +369,7 @@ def __init__( disable_aot_function_inlining=disable_aot_function_inlining, use_training_api=use_training_api, ) + self.cpu_outputs = cpu_outputs def _get_ortvalues_from_torch_tensors( self, tensors: Tuple[torch.Tensor, ...], n_outputs: int @@ -490,23 +493,36 @@ def run_dlpack( feeds is a dictionary of :class:`torch.Tensor`. The output device is CPU even if the outputs are on CUDA. """ - new_feeds = {} + input_names = [] + values = ORTC.OrtValueVector() + device = -1 for k, v in feeds.items(): + device = max(device, v.get_device()) assert hasattr(v, "__dlpack__"), f"class {type(v)} should be serialized" if not v.is_contiguous(): v = v.contiguous() if v.dtype == torch.bool: - # It does not work with dlpack - # unless onnxruntime updates the version it is using. 
- new_feeds[k] = ORTC.OrtValue.ortvalue_from_numpy_with_onnx_type( - v.detach().numpy(), onnx.TensorProto.BOOL - ) + v = v.to(torch.uint8) + v = ORTC.OrtValue.from_dlpack(v.__dlpack__(), True) else: - new_feeds[k] = ORTC.OrtValue.from_dlpack(v.__dlpack__(), False) + v = ORTC.OrtValue.from_dlpack(v.__dlpack__(), False) + input_names.append(k) + values.push_back(v) if self.nvtx: - self.torch.cuda.nvtx.range_push("run_with_ort_values") - ort_outputs = self.sess._sess.run_with_ort_values( - new_feeds, output_names or self.output_names, self.run_options + self.torch.cuda.nvtx.range_push("run_with_ortvaluevector") + + # ort_outputs = self.sess._sess.run_with_ort_values( + # new_feeds, output_names or self.output_names, self.run_options + # ) + ort_outputs = ORTC.OrtValueVector() + out_names = output_names or self.output_names + self.sess._sess.run_with_ortvaluevector( + self.run_options, + input_names, + values, + out_names, + ort_outputs, + [DEVICES[-1 if self.cpu_outputs else device] for o in out_names], ) if self.nvtx: self.torch.cuda.nvtx.range_pop() diff --git a/onnx_diagnostic/helpers/torch_helper.py b/onnx_diagnostic/helpers/torch_helper.py index e86dbef6..ec220b29 100644 --- a/onnx_diagnostic/helpers/torch_helper.py +++ b/onnx_diagnostic/helpers/torch_helper.py @@ -30,9 +30,7 @@ def proto_from_tensor( - arr: "torch.Tensor", # noqa: F821 - name: Optional[str] = None, - verbose: int = 0, + arr: torch.Tensor, name: Optional[str] = None, verbose: int = 0 ) -> onnx.TensorProto: """ Converts a torch Tensor into a TensorProto. @@ -98,7 +96,7 @@ def proto_from_tensor( return tensor -def onnx_dtype_to_torch_dtype(itype: int) -> "torch.dtype": # noqa: F821 +def onnx_dtype_to_torch_dtype(itype: int) -> torch.dtype: """ Converts an onnx type into a torch dtype. @@ -140,7 +138,7 @@ def onnx_dtype_to_torch_dtype(itype: int) -> "torch.dtype": # noqa: F821 ) -def torch_dtype_to_onnx_dtype(to: "torch.dtype") -> int: # noqa: F821 +def torch_dtype_to_onnx_dtype(to: torch.dtype) -> int: """ Converts a torch dtype into a onnx element type. 
@@ -483,7 +481,7 @@ def is_torchdynamo_exporting() -> bool: return False -def to_numpy(tensor: "torch.Tensor") -> np.ndarray: # noqa: F821 +def to_numpy(tensor: torch.Tensor) -> np.ndarray: """Converts a :class:`torch.Tensor` to :class:`numpy.ndarray`.""" try: return tensor.detach().cpu().numpy() @@ -498,6 +496,21 @@ def to_numpy(tensor: "torch.Tensor") -> np.ndarray: # noqa: F821 return tensor.detach().to(torch.float32).cpu().numpy().astype(conv[tensor.dtype]) +def from_numpy(tensor: np.ndarray) -> torch.Tensor: + """Converts a :class:`numpy.ndarray` to :class:`torch.Tensor`.""" + try: + return torch.from_numpy(tensor) + except TypeError: + # We try with ml_dtypes + pass + + import ml_dtypes + + conv = {ml_dtypes.bfloat16: torch.bfloat16} + assert tensor.dtype in conv, f"Unsupported type {tensor.dtype}, not in {conv}" + return torch.from_numpy(tensor.astype(np.float32)).to(conv[tensor.dtype]) + + def replace_string_by_dynamic(dynamic_shapes: Any) -> Any: """Replaces strings by ``torch.export.Dim.DYNAMIC``.""" import torch diff --git a/onnx_diagnostic/reference/ort_evaluator.py b/onnx_diagnostic/reference/ort_evaluator.py index cba391ef..8ac90321 100644 --- a/onnx_diagnostic/reference/ort_evaluator.py +++ b/onnx_diagnostic/reference/ort_evaluator.py @@ -373,6 +373,12 @@ def _make_model_proto( ) else: onx.opset_import.append(oh.make_opsetid("", onnx_opset_version())) + opsets = {d.domain: d.version for d in onx.opset_import} + add = {} + for node in nodes: + if node.domain and node.domain not in opsets and node.domain not in add: + add[node.domain] = 1 + onx.opset_import.extend([oh.make_opsetid(k, v) for k, v in add.items()]) # That helps fixing bugs. onx = shi.infer_shapes(onx) @@ -413,6 +419,7 @@ def _get_hidden_node_inputs(self, node: NodeProto) -> Set[str]: def _get_sess( self, node: Union[ModelProto, NodeProto], inputs: List[Any] ) -> Tuple[ModelProto, _InferenceSession]: + on_cpu = None if isinstance(node, ModelProto): onx = node else: @@ -443,6 +450,8 @@ def _get_sess( voutputs = [oh.make_value_info(o, TypeProto()) for o in node.output] onx = self._make_model_proto([node], vinputs, voutputs) + if node.op_type in {"Shape", "Size"}: + on_cpu = True cls = ( InferenceSessionForNumpy and (not isinstance(self.torch_or_numpy, bool) or not self.torch_or_numpy) else InferenceSessionForTorch ) + if ( + "providers" not in self.session_kwargs or not self.session_kwargs["providers"] + ) and any(hasattr(t, "is_cuda") and t.is_cuda for t in inputs): + sess_kwargs = self.session_kwargs.copy() + sess_kwargs["providers"] = ["CUDAExecutionProvider"] + else: + sess_kwargs = dict(self.session_kwargs or {}) + if on_cpu and "CUDAExecutionProvider" in (sess_kwargs.get("providers", []) or []): + sess_kwargs["cpu_outputs"] = True try: - sess = cls(onx, **self.session_kwargs) + sess = cls(onx, **sess_kwargs) except ( onnxruntime.capi.onnxruntime_pybind11_state.Fail, onnxruntime.capi.onnxruntime_pybind11_state.InvalidGraph, ) @@ -540,7 +558,15 @@ def _run(self, node: NodeProto, inputs: List[Any], results: Dict[str, Any]) -> L feeds = dict(zip(node.input, inputs)) if "" in feeds: - feeds[""] = np.array([0], dtype=np.float32) + cls = None + for k, v in feeds.items(): + if k != "": + cls = v.__class__ + break + assert ( + cls is not None + ), f"Unable to get input class (array or tensor), feeds={string_type(feeds)}" + # np.ndarray([0]) would allocate an empty array, not the value [0] + feeds[""] = ( + cls([0]) if not issubclass(cls, np.ndarray) else np.array([0], dtype=np.float32) + ) assert hasattr(sess, "run"), f"Missing method run for type {type(sess)}" outputs = list(sess.run(None, feeds)) diff --git
a/onnx_diagnostic/torch_onnx/sbs.py b/onnx_diagnostic/torch_onnx/sbs.py index 01e36080..6b4f2359 100644 --- a/onnx_diagnostic/torch_onnx/sbs.py +++ b/onnx_diagnostic/torch_onnx/sbs.py @@ -1,9 +1,12 @@ -from typing import Any, Dict, Iterator, Optional, Tuple, Union +import inspect +from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Union import onnx +import onnx.helper as oh +import numpy as np import torch -from ..helpers import string_type, string_diff, max_diff -from ..helpers.onnx_helper import to_array_extended -from ..helpers.torch_helper import to_numpy +from ..helpers import string_type, string_diff, max_diff, flatten_object +from ..helpers.onnx_helper import pretty_onnx +from ..helpers.torch_helper import to_numpy, from_numpy def validate_fx_tensor( @@ -105,7 +108,27 @@ def run_fx_node( return args if node.op == "call_function": assert callable(node.target), f"{node.target!r} not callable in node {node!r}" - outputs = node.target(*args, **(kwargs or {})) + for a, ea in zip(args, node.args): + if isinstance(a, torch.Tensor) and hasattr(ea, "meta") and "val" in ea.meta: + ta = ea.meta["val"] + assert ( + isinstance(ta, torch.Tensor) + and len(a.shape) == len(ta.shape) + and a.dtype == ta.dtype + ), ( + f"Unable to run node {node!r}, target={node.target!r}, " + f"node.args={node.args!r}, node.kwargs={node.kwargs!r}, " + f"args={string_type(args, with_shape=True, with_device=True)}, " + f"kwargs={string_type(kwargs, with_shape=True, with_device=True)}" + ) + try: + outputs = node.target(*args, **(kwargs or {})) + except RuntimeError as e: + raise RuntimeError( + f"Unable to run node {node!r}, target={node.target!r}, " + f"args={string_type(args, with_shape=True, with_device=True)}, " + f"kwargs={string_type(kwargs, with_shape=True, with_device=True)}" + ) from e validate_fx_outputs(node, outputs) return outputs raise NotImplementedError( @@ -127,6 +150,8 @@ def _pick_result(torch_results: Dict[str, Any], ref: Any) -> Any: return ref if ref is None: return None + if isinstance(ref, torch.layout): + return ref raise NotImplementedError(f"Unable to process args type {type(ref)}") @@ -145,13 +170,65 @@ def prepare_args_kwargs( return new_args, new_kwargs +def post_process_run_aligned_obs( + obs: Tuple[ + Optional[int], + Optional[int], + Optional[str], + Optional[str], + Optional[str], + Optional[str], + Dict[str, Optional[Union[int, float]]], + ], +) -> Dict[str, Optional[Union[str, float, int]]]: + """ + Flattens an observation produced by function + :func:`onnx_diagnostic.torch_onnx.sbs.run_aligned`.
+ """ + dobs = dict( + zip( + [ + "ep_id_node", + "onnx_id_node", + "ep_name", + "onnx_name", + "ep_target", + "onnx_op_type", + "shape_type", + ], + obs, + ) + ) + if "abs" in obs[-1] and obs[-1]["abs"] is not None: + dobs["err_abs"] = obs[-1]["abs"] # type: ignore[assignment] + if "rel" in obs[-1] and obs[-1]["rel"] is not None: + dobs["err_rel"] = obs[-1]["rel"] # type: ignore[assignment] + if "dev" in obs[-1] and obs[-1]["dev"] is not None: + dobs["err_dev"] = obs[-1]["dev"] # type: ignore[assignment] + return dobs # type: ignore[return-value] + + def run_aligned( ep: torch.export.ExportedProgram, onx: Union[onnx.ModelProto, onnx.FunctionProto], - args: Tuple[torch.Tensor, ...], - check_conversion_cls: Union[Dict[str, Any], type], + run_cls: Callable[ + [ + Union[ + onnx.ModelProto, + onnx.FunctionProto, + onnx.GraphProto, + onnx.NodeProto, + ] + ], + List[Union[np.ndarray, torch.Tensor]], + ], + args: Optional[Tuple[torch.Tensor, ...]] = None, kwargs: Optional[Dict[str, Any]] = None, + use_tensor: bool = False, + atol: Optional[float] = None, + rtol: Optional[float] = None, verbose: int = 0, + exc: bool = True, ) -> Iterator[Tuple[Any, ...]]: """ Runs in parallel both the exported program @@ -162,11 +239,26 @@ def run_aligned( :param ep: exported program :param onx: model or function proto + :param run_cls: defines the runtime to use for this task :param args: input args - :param check_conversion_cls: defines the runtime to use for this task :param kwargs: input kwargs + :param use_tensor: use torch tensors instead of numpy arrays + :param atol: absolute tolerance + :param rtol: relative tolerance :param verbose: verbosity level - :return: a list of tuples containing the results, they come in tuple, + :param exc: stops if an exception + :return: a list of tuples containing the results, they come in tuple + + Each tuple is: + + - ep_id_node + - onnx_id_node + - ep_name + - onnx_name + - ep target name + - onnx op _type + - ep or onnx shape and type + - difference Example: @@ -174,14 +266,16 @@ def run_aligned( :showcode: :warningout: UserWarning - import pprint import pandas import torch from onnx_diagnostic.reference import ( # This can be replace by any runtime taking NodeProto as an input. ExtendedReferenceEvaluator as ReferenceEvaluator, ) - from onnx_diagnostic.torch_onnx.sbs import run_aligned + from onnx_diagnostic.torch_onnx.sbs import ( + run_aligned, + post_process_run_aligned_obs, + ) class Model(torch.nn.Module): @@ -193,13 +287,6 @@ def forward(self, x): return ru - def post_process(obs): - dobs = dict(zip(["ep_id_node", "onnx_id_node", "ep_name", "onnx_name"], obs)) - dobs["err_abs"] = obs[-1]["abs"] - dobs["err_rel"] = obs[-1]["rel"] - return dobs - - x = torch.randn((5, 4)) Model()(x) # to make sure the model is running ep = torch.export.export( @@ -210,13 +297,94 @@ def post_process(obs): ).model_proto results = list( map( - post_process, + post_process_run_aligned_obs, + run_aligned( + ep, onx, ReferenceEvaluator, (x,), atol=1e-5, rtol=1e-5, verbose=1 + ), + ), + ) + print("------------") + print("final results") + df = pandas.DataFrame(results) + print(df) + + + This example uses :class:`onnx.reference.ReferenceEvaluator` to run the onnx model + but onnxruntime can also be used through + :class:`onnx_diagnostic.helpers.ort_session.InferenceSessionForTorch`. + It relies on :epkg:`onnxruntime` and selects CPU or CUDA depending + on the device where the inputs are located. 
+ + The :class:`torch.export.ExportedProgram` can be saved on disk + with ``torch.export.save(ep, ".pt2")`` and restored with + ``torch.export.load(".pt2")``. That leaves the inputs to save. + We can then decouple the export from the alignment. + + .. runpython:: + :showcode: + :warningout: UserWarning + + import onnx + import torch + from onnx_diagnostic.torch_export_patches.patch_inputs import use_dyn_not_str + + + class Model(torch.nn.Module): + def forward(self, x): + ry = x.abs() + rz = ry.exp() + rw = rz + 1 + ru = rw.log() + rw + return ru + + + x = torch.randn((5, 4)) + dynamic_shapes = ({0: "batch"},) + Model()(x) # to make sure the model is running + ep = torch.export.export( + Model(), (x,), dynamic_shapes=use_dyn_not_str(dynamic_shapes) + ) + onx = torch.onnx.export( + Model(), (x,), dynamic_shapes=dynamic_shapes + ).model_proto + + torch.export.save(ep, "test_doc_sbs_example.pt2") + onnx.save(onx, "test_doc_sbs_example.onnx") + torch.save((x,), "test_doc_sbs_example.pt") + + Then we can restore all of them and run it. + + .. runpython:: + :showcode: + :warningout: UserWarning + + import pandas + import onnx + import torch + from onnx_diagnostic.torch_onnx.sbs import ( + run_aligned, + post_process_run_aligned_obs, + ) + from onnx_diagnostic.reference import OnnxruntimeEvaluator + + + ep = torch.export.load("test_doc_sbs_example.pt2") + onx = onnx.load("test_doc_sbs_example.onnx") + inputs = torch.load("test_doc_sbs_example.pt") + + + results = list( + map( + post_process_run_aligned_obs, run_aligned( ep, onx, - (x,), - check_conversion_cls=dict(cls=ReferenceEvaluator, atol=1e-5, rtol=1e-5), + OnnxruntimeEvaluator, + inputs, + atol=1e-5, + rtol=1e-5, verbose=1, + use_tensor=True, ), ), ) @@ -224,19 +392,116 @@ def post_process(obs): print("final results") df = pandas.DataFrame(results) print(df) + + A command line can also be run: + + .. code-block:: bash + + python -m onnx_diagnostic sbs -i .input.pt \\ + --ep .pt2 \\ + -m .onnx \\ + -o results.xlsx \\ + -v 1 --atol=0.1 --rtol=1 """ - assert not kwargs, f"Not implemented when kwargs={string_type(kwargs,with_shape=True)}" - cls, atol, rtol = ( - ( - check_conversion_cls["cls"], - check_conversion_cls["atol"], - check_conversion_cls["rtol"], - ) - if isinstance(check_conversion_cls, dict) - else (check_conversion_cls, None, None) + assert callable(run_cls), f"run_cls={run_cls} not a callable" + str_kws = dict(with_shape=True, with_device=True) + has_cuda = any( + (isinstance(t, torch.Tensor) and t.is_cuda) + for t in flatten_object([args, kwargs], drop_keys=True) ) + default_device = None + if has_cuda: + for t in flatten_object([args, kwargs], drop_keys=True): + if t is not None and t.is_cuda: + default_device = t.device + break + run_cls_kwargs = { + "ir_version": onx.ir_version, + "opsets": {d.domain: d.version for d in onx.opset_import}, + "verbose": max(verbose - 1, 0), + "providers": ( + ["CUDAExecutionProvider", "CPUExecutionProvider"] + if has_cuda + else ["CPUExecutionProvider"] + ), + } + run_cls_kwargs = { + k: v + for k, v in run_cls_kwargs.items() + if k in set(inspect.signature(run_cls).parameters) + } + if verbose: + print(f"[run_aligned] run_cls={run_cls}") + print(f"[run_aligned] run_cls_kwargs={run_cls_kwargs}") + + def _check_tensor_(name, obj, flip_type=False): + if flip_type: + if use_tensor: + if isinstance(obj, np.ndarray): + obj = from_numpy(obj) + else: + if isinstance(obj, torch.Tensor): + obj = to_numpy(obj) + + assert not use_tensor or isinstance(obj, torch.Tensor), ( + f"Unexpected type {type(obj)} for {name!r}. 
" + f"use_tensor is True so torch.Tensor is expected." + ) + assert use_tensor or isinstance(obj, np.ndarray), ( + f"Unexpected type {type(obj)} for {name!r}. " + f"use_tensor is False so np.array is expected." + ) + return obj + + def _make_node_from_initializer(proto: onnx.TensorProto) -> onnx.NodeProto: + return oh.make_node("Constant", [], [proto.name], value=proto) + + def _loop_cmp( + mapping_onnx_to_torch, + torch_results, + onnx_results, + o, + r, + verbose, + atol, + rtol, + i, + i_onnx, + ): + onnx_results[o] = _check_tensor_(o, r) + if verbose: + print(f"[run_aligned-nx] +res: {o}={string_type(r, **str_kws)}") + + to = mapping_onnx_to_torch.get(o, o) + if to in torch_results: + d = max_diff(torch_results[to], r) + if verbose: + if o == to: + print(f"[run_aligned-==] cmp {to}: {string_diff(d)}") + else: + print(f"[run_aligned-~~] cmd {to}/{o}: {string_diff(d)}") + if not ( + atol is None or rtol is None or (d["abs"] <= atol and d["rel"] <= rtol) + ): + if exc: + raise ValueError( + f"discrepancies detected for results [{to}/{o}]: " + f"{string_diff(d)}" + f"\n-- torch_results: {string_type(torch_results[to], **str_kws)}" + f"\n-- onnx_results: {string_type(r, **str_kws)}" + f"\n-- torch\n{torch_results[to]}\n-- onnx\n{r}" + ) + else: + print( + f"[run_align-dx] discrepancies " + f"{string_diff(d, with_shape=True, with_device=True)} - " + f"[{to}/{o}]" + ) + return (i, i_onnx, o, to, string_type(torch_results[to], **str_kws), d) + return None - # retrieve the positions + if verbose: + print(f"[run_aligned] walks through {len(ep.graph.nodes)} nodes from torch") positions: Dict[str, Any] = {} for i, node in enumerate(ep.graph.nodes): if isinstance(node.name, str): @@ -245,6 +510,8 @@ def post_process(obs): for n in node.name: positions[n] = dict(fx=i) + if verbose: + print(f"[run_aligned] walks through {len(onx.graph.node)} nodes from onnx") for i, node in enumerate(onx.graph.node): for n in node.output: if n in positions: @@ -252,24 +519,36 @@ def post_process(obs): else: positions[n] = dict(onnx=i) + if verbose: + print(f"[run_aligned] handles {len(onx.graph.initializer)} initializers from onnx") onnx_results: Dict[str, Any] = {} for init in onx.graph.initializer: # type: ignore positions[init.name] = -1 - onnx_results[init.name] = to_array_extended(init) - param_name = f"p_{init.name.replace('.', '_')}" - if param_name == init.name: + t = run_cls( + _make_node_from_initializer(init), + **run_cls_kwargs, + ).run( # type: ignore[attr-defined] + None, {} + )[ + 0 + ] + if default_device and t.numel() >= 1024: + # Let's force its way to cuda (should check the device has well). 
+ t = t.to(default_device) + onnx_results[init.name] = _check_tensor_(init.name, t, flip_type=True) + if init.name.startswith("init"): + # not a weight continue - assert param_name not in onnx_results, ( - f"Some confusion may happen because {init.name!r} -> {param_name!r} " - f"and onnx_results has {sorted(onnx_results)}" - ) - onnx_results[param_name] = onnx_results[init.name] - torch_results: Dict[str, Any] = { - k: torch.from_numpy(v.copy()) - for k, v in onnx_results.items() - if not k.startswith("init") - } + if verbose: + print(f"[run_aligned] handles {len(onnx_results)} common initializers from torch") + # we should be careful, torch may modify the weights inplace, + # it may be difficult to share weights + torch_results: Dict[str, Any] = {} + if verbose: + print( + f"[run_aligned] handles other constants from {len(ep.graph.nodes)} nodes from torch" + ) last_position = 0 torch_output_names = None for node in ep.graph.nodes: @@ -285,26 +564,47 @@ def post_process(obs): mapping_onnx_to_torch = dict(zip(onnx_outputs_names, torch_output_names)) if verbose: + print(f"[run_aligned] torch {len(torch_results)} constants") + print(f"[run_aligned] onnx {len(onnx_results)} constants") + print(f"[run_aligned] common {len(mapping_onnx_to_torch)} constants") for k, v in torch_results.items(): - print( - f"[run_aligned] +torch-cst: {k}: " - f"{string_type(v, with_shape=True, with_min_max=True)}" - ) + print(f"[run_aligned-ep] +cst: {k}: {string_type(v, **str_kws)}") for k, v in onnx_results.items(): - print( - f"[run_aligned] +onnx-init: {k}: " - f"{string_type(v, with_shape=True, with_min_max=True)}" - ) + print(f"[run_aligned-nx] +ini: {k}: {string_type(v, **str_kws)}") - for inp, v in zip(onx.graph.input, args): - onnx_results[inp.name] = to_numpy(v) + onnx_args = list(args) if args else [] + if kwargs: + onnx_args.extend(flatten_object(kwargs, drop_keys=True)) + if verbose: + print(f"[run_aligned] args: {string_type(args, **str_kws)}") + print(f"[run_aligned] kwargs: {string_type(kwargs, **str_kws)}") + print(f"[run_aligned] onnx: {string_type(onnx_args, **str_kws)}") + print(f"[run_aligned] walks through {len(onx.graph.input)} onnx inputs") + for inp, v in zip(onx.graph.input, onnx_args): + onnx_results[inp.name] = _check_tensor_(inp.name, v if use_tensor else to_numpy(v)) if verbose: - print( - f"[run_aligned] +onnx-input: {inp.name}: " - f"{string_type(v, with_shape=True, with_min_max=True)}" - ) + print(f"[run_aligned-nx] +inp: {inp.name}: {string_type(v, **str_kws)}") - for i, node in enumerate(ep.graph.nodes): + placeholders = {node.name for node in ep.graph.nodes if node.op == "placeholder"} + ep_state_dict = {**ep.state_dict, **dict(ep.named_buffers())} + placeholders_to_state_dict = { + **{f"p_{name.replace('.', '_')}": name for name in ep.state_dict}, + **{f"b_{name.replace('.', '_')}": name for name, _ in ep.named_buffers()}, + } + for n in onnx_results: + if n not in placeholders: + yield ( + None, + -1, + None, + n, + None, + "initializer", + string_type(onnx_results[n], **str_kws), + {}, + ) + ep_graph_nodes = list(ep.graph.nodes) + for i, node in enumerate(ep_graph_nodes): if verbose: if node.op == "call_function": print( @@ -315,25 +615,62 @@ def post_process(obs): print(f"[run_aligned] run ep.graph.nodes[{i}]: {node.op} -> {node.name!r}") if node.op == "placeholder": - if node.name in onnx_results: - torch_results[node.name] = torch.from_numpy(onnx_results[node.name].copy()) + is_input = node.name in placeholders + if node.name in onnx_results and ( + is_input + or 
ep_state_dict[placeholders_to_state_dict[node.name]].shape == onnx_results[node.name].shape + ): + torch_results[node.name] = ( + onnx_results[node.name] + if use_tensor + else torch.from_numpy(onnx_results[node.name]) + ) if verbose: t = torch_results[node.name] - print( - f"[run_aligned] +torch {node.name}=" - f"{string_type(t, with_shape=True, with_min_max=True)}" - ) - continue - raise AssertionError( - f"unable to process node {node.op} -> {node.name!r} " - f"not in {sorted(onnx_results)}, len(args)={len(args)}, " - f"onx.graph.input={[i.name for i in onx.graph.input]}" - ) + print(f"[run_aligned-ep] =plh: {node.name}={string_type(t, **str_kws)}") + # Otherwise, it is an input. + yield ( + -1, + -1, + node.name, + node.name, + "input" if is_input else "placeholder", + "input" if is_input else "initializer", + string_type(torch_results[node.name], **str_kws), + ( + {} + if is_input + else max_diff( + ep_state_dict[placeholders_to_state_dict[node.name]], + onnx_results[node.name], + ) + ), + ) + else: + assert node.name in placeholders_to_state_dict, ( + f"Unable to find placeholder {node.name!r} in " + f"{sorted(placeholders_to_state_dict)}" + ) + torch_results[node.name] = ep_state_dict[placeholders_to_state_dict[node.name]] + if verbose: + print( + f"[run_aligned-ep] +plh: {node.name}=" + f"{string_type(torch_results[node.name], **str_kws)}" + ) + yield ( + -1, + None, + node.name, + None, + "placeholder", + None, + string_type(torch_results[node.name], **str_kws), + {}, + ) + continue outputs = [node.name] if isinstance(node.name, str) else list(node.name) args, kwargs = prepare_args_kwargs(torch_results, node) new_outputs = run_fx_node(node, args, kwargs) - if isinstance(new_outputs, (torch.Tensor, int, float, list)): + if isinstance(new_outputs, (torch.Tensor, int, float, list, tuple)): new_outputs = (new_outputs,) if new_outputs is None: @@ -344,10 +681,7 @@ def post_process(obs): torch_results[k] = v if verbose: for k, v in zip(outputs, new_outputs): - print( - f"[run_aligned] +torch {k}=" - f"{string_type(v, with_shape=True, with_min_max=True)}" - ) + print(f"[run_aligned-ep] +res: {k}={string_type(v, **str_kws)}") max_pos = -2 for n in outputs: @@ -364,43 +698,58 @@ def post_process(obs): f"[run_aligned] run onx.graph.node[{i_onnx}]: " f"{node.op_type}({', '.join(node.input)}) -> {', '.join(node.output)}" ) - ref = cls(node) + ref = run_cls(node, **run_cls_kwargs) feeds = {k: onnx_results[k] for k in node.input} - res = ref.run(None, feeds) + res = ref.run(None, feeds) # type: ignore[attr-defined] + assert ( + not has_cuda + or not any(t is not None and t.is_cuda for t in feeds.values()) + or any(t is not None and t.is_cuda for t in res) + or node.op_type in {"Shape", "Size"} # on CPU no matter what + or node.op_type + in { + "Add", + "Concat", + "Div", + "Gather", + "Mul", + "Range", + "Squeeze", + "Sub", + "Unsqueeze", + } # not sure, could be about shapes + ), ( + f"One input is on cuda but there is no float output on cuda, " + f"feeds={string_type(feeds, with_device=True, with_shape=True)}, " + f"res={string_type(res, with_device=True, with_shape=True)}, " + f"node is {pretty_onnx(node)}" + ) for o, r in zip(node.output, res): - onnx_results[o] = r - if verbose: - print( - f"[run_aligned] +onnx {o}=" - f"{string_type(r, with_shape=True, with_min_max=True)}" + tmp = _loop_cmp( + mapping_onnx_to_torch, + torch_results, + onnx_results, + o, + r, + verbose, + atol, + rtol, + i, + i_onnx, + ) + if tmp is not None: + yield ( + *tmp[:4], + str(ep_graph_nodes[tmp[0]].target), + onx.graph.node[tmp[1]].op_type, + *tmp[-2:], ) - to = 
mapping_onnx_to_torch.get(o, o) - if to in torch_results: - d = max_diff(torch_results[to], r) - if verbose: - if o == to: - print(f"[run_aligned] =common results {to}: {string_diff(d)}") - else: - print(f"[run_aligned] =common results {to}/{o}: {string_diff(d)}") - if not ( - atol is None - or rtol is None - or (d["abs"] <= atol and d["rel"] <= rtol) - ): - skw = dict(with_shape=True, with_min_max=True) - raise ValueError( - f"discrepancies detected for results [{to}/{o}]: " - f"{string_diff(d)}" - f"\n-- torch_results: {string_type(torch_results[to], **skw)}" - f"\n-- onnx_results: {string_type(r, **skw)}" - f"\n-- torch\n{torch_results[to]}\n-- onnx\n{r}" - ) - yield (i, i_onnx, o, to, d) - last_position = max_pos + 1 # complete the execution of the onnx graph + if verbose: + print(f"[run_aligned] complete execution of onnx graph from pos={last_position}") for i_onnx in range(last_position, len(onx.graph.node)): node = onx.graph.node[i_onnx] if verbose: @@ -408,33 +757,26 @@ def post_process(obs): f"[run_aligned] run onx.graph.node[{i_onnx}]: " f"{node.op_type}({', '.join(node.input)}) -> {', '.join(node.output)}" ) - ref = cls(node) + ref = run_cls(node, **run_cls_kwargs) feeds = {k: onnx_results[k] for k in node.input} - res = ref.run(None, feeds) + res = ref.run(None, feeds) # type: ignore[attr-defined] for o, r in zip(node.output, res): - onnx_results[o] = r - if verbose: - print( - f"[run_aligned] +onnx {o}=" - f"{string_type(r, with_shape=True, with_min_max=True)}" + tmp = _loop_cmp( + mapping_onnx_to_torch, + torch_results, + onnx_results, + o, + r, + verbose, + atol, + rtol, + i, + i_onnx, + ) + if tmp is not None: + yield ( + *tmp[:4], + str(ep_graph_nodes[tmp[0]].target), + onx.graph.node[tmp[1]].op_type, + *tmp[-2:], ) - - to = mapping_onnx_to_torch.get(o, o) - if to in torch_results: - d = max_diff(torch_results[to], r) - if verbose: - if o == to: - print(f"[run_aligned] =common results* {to}: {string_diff(d)}") - else: - print(f"[run_aligned] =common results* {to}/{o}: {string_diff(d)}") - if not ( - atol is None or rtol is None or (d["abs"] <= atol and d["rel"] <= rtol) - ): - skw = dict(with_shape=True, with_min_max=True) - raise ValueError( - f"discrepancies detected for results* [{to}/{o}]: {string_diff(d)}" - f"\n-- torch_results: {string_type(torch_results[to], **skw)}" - f"\n-- onnx_results: {string_type(r, **skw)}" - f"\n-- torch\n{torch_results[to]}\n-- onnx\n{r}" - ) - yield (i, i_onnx, o, to, d) diff --git a/pyproject.toml b/pyproject.toml index deb8af51..2e5e4107 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "onnx-diagnostic" -version = "0.8.2" +version = "0.8.3" description = "Tools to help converting pytorch models into ONNX." readme = "README.rst" authors = [