From 731982af83be37a71709d048c6198b1e1528a571 Mon Sep 17 00:00:00 2001
From: xadupre <xadupre@microsoft.com>
Date: Sun, 8 Jun 2025 23:07:44 +0200
Subject: [PATCH 1/5] noinline

---
 onnx_diagnostic/torch_models/test_helper.py | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/onnx_diagnostic/torch_models/test_helper.py b/onnx_diagnostic/torch_models/test_helper.py
index c41009f1..57c17b35 100644
--- a/onnx_diagnostic/torch_models/test_helper.py
+++ b/onnx_diagnostic/torch_models/test_helper.py
@@ -1334,13 +1334,13 @@ def call_torch_export_custom(
         "custom-nostrict",
         "custom-nostrict-default",
         "custom-nostrict-all",
-        "custom-inline",
-        "custom-strict-inline",
-        "custom-strict-default-inline",
-        "custom-strict-all-inline",
-        "custom-nostrict-inline",
-        "custom-nostrict-default-inline",
-        "custom-nostrict-all-inline",
+        "custom-noinline",
+        "custom-strict-noinline",
+        "custom-strict-default-noinline",
+        "custom-strict-all-noinline",
+        "custom-nostrict-noinline",
+        "custom-nostrict-default-noinline",
+        "custom-nostrict-all-noinline",
     }
     assert exporter in available, f"Unexpected value for exporter={exporter!r} in {available}"
     assert "model" in data, f"model is missing from data: {sorted(data)}"
@@ -1381,10 +1381,7 @@ def call_torch_export_custom(
         ),
         save_ep=(os.path.join(dump_folder, f"{exporter}.ep") if dump_folder else None),
     )
-    inline = "-inline" in exporter
-    if inline:
-        export_options.aten_as_function = set()
-
+    inline = "-noinline" not in exporter
     options = OptimizationOptions(patterns=optimization) if optimization else None
     model = data["model"]
     kws = dict(

From 97eb9e5989590c2aea7b89bb81cf0199d0e8e446 Mon Sep 17 00:00:00 2001
From: xadupre <xadupre@microsoft.com>
Date: Sun, 8 Jun 2025 23:50:13 +0200
Subject: [PATCH 2/5] ut

---
 _unittests/ut_torch_models/test_test_helpers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/_unittests/ut_torch_models/test_test_helpers.py b/_unittests/ut_torch_models/test_test_helpers.py
index b16d5fbc..7a990896 100644
--- a/_unittests/ut_torch_models/test_test_helpers.py
+++ b/_unittests/ut_torch_models/test_test_helpers.py
@@ -176,7 +176,7 @@ def test_validate_model_custom_torch(self):
             mid,
             do_run=True,
             verbose=10,
-            exporter="custom-inline",
+            exporter="custom-noinline",
             dump_folder="dump_test_validate_model_custom_torch",
             patch=True,
             stop_if_static=2 if pv.Version(torch.__version__) > pv.Version("2.6.1") else 0,

From 8a7817273514bc9da4e2fc47cd19c1863967b84d Mon Sep 17 00:00:00 2001
From: xadupre <xadupre@microsoft.com>
Date: Mon, 9 Jun 2025 09:47:12 +0200
Subject: [PATCH 3/5] better graphs

---
 .../technical/plot_layer_norm_discrepancies.py | 17 ++++++++++++++---
 _doc/technical/plot_parallelized_reduction.py  |  1 +
 onnx_diagnostic/doc.py                         | 18 ++++++++++++++++++
 3 files changed, 33 insertions(+), 3 deletions(-)

diff --git a/_doc/technical/plot_layer_norm_discrepancies.py b/_doc/technical/plot_layer_norm_discrepancies.py
index 4a4d3b96..3cdb475a 100644
--- a/_doc/technical/plot_layer_norm_discrepancies.py
+++ b/_doc/technical/plot_layer_norm_discrepancies.py
@@ -21,6 +21,7 @@
 import onnxruntime
 import torch
 from onnx_array_api.plotting.graphviz_helper import plot_dot
+from onnx_diagnostic.doc import rotate_align, save_fig
 from onnx_diagnostic.ext_test_case import unit_test_going
 from onnx_diagnostic.helpers import max_diff, string_diff, string_type
 from onnx_diagnostic.helpers.onnx_helper import onnx_dtype_name, onnx_dtype_to_np_dtype
@@ -143,7 +144,12 @@ def cast_feeds(itype, provider, feeds):
 # %%
 # Visually.
 
-df["abs"].plot.bar(title="Discrepancies ORT / torch for LayerNorm(X) @ W + B")
+save_fig(
+    rotate_align(
+        df[["abs"]].plot.bar(title="Discrepancies ORT / torch for LayerNorm(X) @ W + B")
+    ),
+    "plot_layer_norm_discrepancies_1.png",
+)
 
 # %%
 # The discrepancies are significant on CUDA, higher for float16.
@@ -207,6 +213,11 @@ def cast_feeds(itype, provider, feeds):
 # %%
 # Visually.
 
-df[["diff_ort", "diff_torch"]].plot.bar(
-    title="ORT/Torch or Torch/ORT for LayerNorm(X) @ W + B"
+save_fig(
+    rotate_align(
+        df[["diff_ort", "diff_torch"]].plot.bar(
+            title="ORT/Torch or Torch/ORT for LayerNorm(X) @ W + B"
+        )
+    ),
+    "plot_layer_norm_discrepancies_2.png",
 )
diff --git a/_doc/technical/plot_parallelized_reduction.py b/_doc/technical/plot_parallelized_reduction.py
index 25e91d44..f8bd27f4 100644
--- a/_doc/technical/plot_parallelized_reduction.py
+++ b/_doc/technical/plot_parallelized_reduction.py
@@ -188,6 +188,7 @@ def make_value(base, value):
 # Visually.
 
 ax = df.plot.bar(logy=True)
+ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
 fig = ax.get_figure()
 fig.savefig("plot_parallelized_reduction.png")
 
diff --git a/onnx_diagnostic/doc.py b/onnx_diagnostic/doc.py
index 5e517ad5..6eed62c8 100644
--- a/onnx_diagnostic/doc.py
+++ b/onnx_diagnostic/doc.py
@@ -30,3 +30,21 @@ def plot_legend(
     ax.grid(False)
     ax.set_axis_off()
     return ax
+
+
+def rotate_align(ax, angle=15, align="right"):
+    """Rotates the x tick labels by ``angle`` and aligns them (``align``). Returns ax."""
+    for label in ax.get_xticklabels():
+        label.set_rotation(angle)
+        label.set_horizontalalignment(align)
+    return ax
+
+
+def save_fig(ax, name: str):
+    """Applies ``tight_layout``, saves the figure of ``ax`` to ``name``. Returns ax."""
+    import matplotlib.pyplot as plt
+
+    plt.tight_layout()  # NOTE(review): acts on the current figure, assumed to be ax's
+    fig = ax.get_figure()
+    fig.savefig(name)
+    return ax

From fc664f8c4152922d541c164ed0a4613540152b58 Mon Sep 17 00:00:00 2001
From: xadupre <xadupre@microsoft.com>
Date: Mon, 9 Jun 2025 13:01:38 +0200
Subject: [PATCH 4/5] example

---
 .../plot_layer_norm_discrepancies.py          | 54 ++++++++++++++-----
 _doc/technical/plot_parallelized_reduction.py |  6 +++
 k.py                                          | 30 -----------
 onnx_diagnostic/doc.py                        | 28 ++++++++++
 onnx_diagnostic/helpers/doc_helper.py         | 28 ++++++++--
 5 files changed, 99 insertions(+), 47 deletions(-)
 delete mode 100644 k.py

diff --git a/_doc/technical/plot_layer_norm_discrepancies.py b/_doc/technical/plot_layer_norm_discrepancies.py
index 3cdb475a..de7a6e71 100644
--- a/_doc/technical/plot_layer_norm_discrepancies.py
+++ b/_doc/technical/plot_layer_norm_discrepancies.py
@@ -6,22 +6,26 @@
 :ref:`l-plot-parallelized-reduction`, reduction operations
 are sensitive to parallelization.
 
-We consider a small model including a layer normalization
-followed by a matrix multiplication and we show that replacing
-a kernel by another one may significantly impact the output.
+Methodology
++++++++++++
+
+We consider a simple model with a LayerNormalization followed by a MatMul.
+Each operator can be run with :epkg:`onnxruntime` or :epkg:`pytorch`.
+We compare the four combinations.
 
 The model
 +++++++++
 """
 
 import itertools
+import numpy as np
 import pandas
 import onnx
 import onnx.helper as oh
 import onnxruntime
 import torch
 from onnx_array_api.plotting.graphviz_helper import plot_dot
-from onnx_diagnostic.doc import rotate_align, save_fig
+from onnx_diagnostic.doc import rotate_align, save_fig, plot_histogram, title
 from onnx_diagnostic.ext_test_case import unit_test_going
 from onnx_diagnostic.helpers import max_diff, string_diff, string_type
 from onnx_diagnostic.helpers.onnx_helper import onnx_dtype_name, onnx_dtype_to_np_dtype
@@ -80,6 +84,8 @@ def make_feeds(last_dim: int):
 
 
 def cast_feeds(itype, provider, feeds):
+    ttype = onnx_dtype_to_torch_dtype(itype)
+    np_dtype = onnx_dtype_to_np_dtype(itype)
     np_feeds = {k: v.detach().numpy() for k, v in feeds.items()}
     if provider == "CUDA":
         if not torch.cuda.is_available():
@@ -102,8 +108,6 @@ def cast_feeds(itype, provider, feeds):
 baseline = {}
 
 for provider, itype in itertools.product(["CPU", "CUDA"], [TFLOAT, TFLOAT16]):
-    ttype = onnx_dtype_to_torch_dtype(itype)
-    np_dtype = onnx_dtype_to_np_dtype(itype)
     tch_feeds, ort_feeds = cast_feeds(itype, provider, feeds)
     if tch_feeds is None:
         continue
@@ -156,6 +160,22 @@ def cast_feeds(itype, provider, feeds):
 # Let's see which operator is responsible for them,
 # *LayerNormalization* or *MatMul*.
 
+# %%
+# Distribution of the results
+# +++++++++++++++++++++++++++
+
+tensor = baseline[TFLOAT16, "CPU", "ort"][0].ravel().astype(np.float32)
+print(pandas.DataFrame({"expected": tensor}).describe())
+
+# %%
+# Histogram.
+
+save_fig(
+    title(plot_histogram(tensor), "Distribution of the computed results"),
+    "plot_layer_norm_discrepancies_hist.png",
+)
+
+
 # %%
 # The discrepancies come from?
 # ++++++++++++++++++++++++++++
@@ -165,7 +185,7 @@ def cast_feeds(itype, provider, feeds):
 data = []
 
 for mod, provider, itype in itertools.product(
-    ["ORT-TORCH", "TORCH-ORT"], ["CPU", "CUDA"], [TFLOAT, TFLOAT16]
+    ["ORT-ORT", "ORT-TORCH", "TORCH-ORT", "TORCH-TORCH"], ["CPU", "CUDA"], [TFLOAT, TFLOAT16]
 ):
     ttype = onnx_dtype_to_torch_dtype(itype)
     np_dtype = onnx_dtype_to_np_dtype(itype)
@@ -173,11 +193,10 @@ def cast_feeds(itype, provider, feeds):
     if tch_feeds is None:
         continue
 
+    ker1, ker2 = mod.split("-")
     custom_kernels = (
-        {("", "LayerNormalization"): LayerNormalizationOrt}
-        if mod == "ORT-TORCH"
-        else {("", "MatMul"): MatMulOrt}
-    )
+        {("", "LayerNormalization"): LayerNormalizationOrt} if ker1 == "ORT" else {}
+    ) | ({("", "MatMul"): MatMulOrt} if ker2 == "ORT" else {})
 
     model = get_model(itype)
     print()
@@ -206,7 +225,7 @@ def cast_feeds(itype, provider, feeds):
     )
 
 # %%
-df = pandas.DataFrame(data).set_index(["model", "provider", "dtype"])
+df = pandas.DataFrame(data).set_index(["dtype", "provider", "model"])
 df = df.sort_index()
 print(df)
 
@@ -216,8 +235,17 @@ def cast_feeds(itype, provider, feeds):
 save_fig(
     rotate_align(
         df[["diff_ort", "diff_torch"]].plot.bar(
-            title="ORT/Torch or Torch/ORT for LayerNorm(X) @ W + B"
+            title="ORT/Torch or Torch/ORT for LayerNorm(X) @ W + B",
+            figsize=(10, 4),
         )
     ),
     "plot_layer_norm_discrepancies_2.png",
 )
+
+# %%
+# Conclusion
+# ++++++++++
+#
+# :epkg:`torch` seems able to replicate the same results if the same computation
+# is run multiple times. :epkg:`onnxruntime` is only able to do that on CUDA.
+# With float16 and CUDA, LayerNormalization seems to introduce some discrepancies.
diff --git a/_doc/technical/plot_parallelized_reduction.py b/_doc/technical/plot_parallelized_reduction.py
index f8bd27f4..18b314ea 100644
--- a/_doc/technical/plot_parallelized_reduction.py
+++ b/_doc/technical/plot_parallelized_reduction.py
@@ -23,6 +23,12 @@
 
 With :math:`\\mathbb{E}X = mean(X)`,
 :math:`\\mathbb{V}X = mean\\left(\\left(X - mean(X)\\right)^2\\right)`.
+
+Methodology
++++++++++++
+
+**Permutation should not change the average.**
+
 We draw 128 random permutations of X. The average or mean should not change.
 And the normalized vector should have the same values. In the first case, we compute
 the difference between the highest and the lowest values obtained for the average.
diff --git a/k.py b/k.py
deleted file mode 100644
index fd3cdc57..00000000
--- a/k.py
+++ /dev/null
@@ -1,30 +0,0 @@
-import onnx
-import onnx.helper as oh
-import torch
-from onnx_diagnostic.helpers import string_type
-from onnx_diagnostic.reference import TorchOnnxEvaluator
-
-TFLOAT = onnx.TensorProto.FLOAT
-
-proto = oh.make_model(
-    oh.make_graph(
-        [
-            oh.make_node("Sigmoid", ["Y"], ["sy"]),
-            oh.make_node("Mul", ["Y", "sy"], ["ysy"]),
-            oh.make_node("Mul", ["X", "ysy"], ["final"]),
-        ],
-        "-nd-",
-        [
-            oh.make_tensor_value_info("X", TFLOAT, [1, "b", "c"]),
-            oh.make_tensor_value_info("Y", TFLOAT, ["a", "b", "c"]),
-        ],
-        [oh.make_tensor_value_info("final", TFLOAT, ["a", "b", "c"])],
-    ),
-    opset_imports=[oh.make_opsetid("", 18)],
-    ir_version=9,
-)
-
-sess = TorchOnnxEvaluator(proto, verbose=1)
-feeds = dict(X=torch.rand((4, 5)), Y=torch.rand((4, 5)))
-result = sess.run(None, feeds)
-print(string_type(result, with_shape=True, with_min_max=True))
diff --git a/onnx_diagnostic/doc.py b/onnx_diagnostic/doc.py
index 6eed62c8..85391826 100644
--- a/onnx_diagnostic/doc.py
+++ b/onnx_diagnostic/doc.py
@@ -1,3 +1,7 @@
+from typing import Optional
+import numpy as np
+
+
 def reset_torch_transformers(gallery_conf, fname):
     "Resets torch dynamo for :epkg:`sphinx-gallery`."
     import matplotlib.pyplot as plt
@@ -48,3 +52,27 @@ def save_fig(ax, name: str):
     fig = ax.get_figure()
     fig.savefig(name)
     return ax
+
+
+def title(ax: "plt.axes", title: str) -> "plt.axes":  # noqa: F821
+    "Sets ``title`` as the title of ``ax`` and returns ``ax``."
+    ax.set_title(title)
+    return ax
+
+
+def plot_histogram(
+    tensor: np.ndarray,
+    ax: Optional["plt.axes"] = None,  # noqa: F821
+    bins: int = 30,
+    color: str = "orange",
+    alpha: float = 0.7,
+) -> "plt.axes":  # noqa: F821
+    "Draws a log-scale histogram of the tensor values on ``ax`` and returns ``ax``."
+    if ax is None:
+        import matplotlib.pyplot as plt
+
+        ax = plt.gca()  # no axes given: draw on pyplot's current axes
+        ax.cla()  # start from empty axes
+    ax.hist(tensor, bins=bins, color=color, alpha=alpha)  # honor the caller's settings
+    ax.set_yscale("log")
+    return ax
diff --git a/onnx_diagnostic/helpers/doc_helper.py b/onnx_diagnostic/helpers/doc_helper.py
index cd9e7d97..97582018 100644
--- a/onnx_diagnostic/helpers/doc_helper.py
+++ b/onnx_diagnostic/helpers/doc_helper.py
@@ -1,3 +1,4 @@
+import os
 from typing import Dict, Optional, Tuple
 import onnx
 import onnx.helper as oh
@@ -6,6 +7,17 @@
 from .torch_helper import onnx_dtype_to_torch_dtype, torch_dtype_to_onnx_dtype
 from .ort_session import InferenceSessionForTorch
 
+_SAVED = []
+_SAVE_OPTIMIZED_MODEL_ = int(os.environ.get("DUMP_ONNX", "0"))
+
+
+def _get_model_name(op_name: str, provider: str) -> Optional[str]:
+    "Returns a unique file name to dump the model, None if dumps are disabled."
+    if _SAVE_OPTIMIZED_MODEL_:
+        _SAVED.append(f"dump_doc_{op_name}_{provider}_{len(_SAVED)}.onnx")
+        return _SAVED[-1]
+    return None
+
 
 class LayerNormalizationOrt(OpRunKernel):
     "LayerNormalization with onnxruntime"
@@ -13,7 +25,7 @@ class LayerNormalizationOrt(OpRunKernel):
     @classmethod
     def device_dependent(cls) -> bool:
         "Needs device."
-        return False
+        return True
 
     def __init__(
         self,
@@ -70,7 +82,11 @@ def _make_model(self, itype: int, rank: int, has_bias: bool) -> onnx.ModelProto:
         )
         provider = "CPUExecutionProvider" if self.is_cpu else "CUDAExecutionProvider"
         self._provider = provider
-        return InferenceSessionForTorch(layer_model, providers=[provider])
+        return InferenceSessionForTorch(
+            layer_model,
+            optimized_model_filepath=_get_model_name("layer_norm", provider),
+            providers=[provider],
+        )
 
     def run(self, x, scale, bias=None):
         itype = torch_dtype_to_onnx_dtype(x.dtype)
@@ -94,7 +110,7 @@ class MatMulOrt(OpRunKernel):
     @classmethod
     def device_dependent(cls) -> bool:
         "Needs device."
-        return False
+        return True
 
     def __init__(
         self,
@@ -127,7 +143,11 @@ def _make_model(self, itype: int, ranka: int, rankb: int) -> onnx.ModelProto:
         )
         provider = "CPUExecutionProvider" if self.is_cpu else "CUDAExecutionProvider"
         self._provider = provider
-        return InferenceSessionForTorch(model, providers=[provider])
+        return InferenceSessionForTorch(
+            model,
+            optimized_model_filepath=_get_model_name("matmul", provider),
+            providers=[provider],
+        )
 
     def run(self, a, b):
         itype = torch_dtype_to_onnx_dtype(a.dtype)

From 2216d833aa549311951300f8e089ac3ccdf96462 Mon Sep 17 00:00:00 2001
From: xadupre <xadupre@microsoft.com>
Date: Mon, 9 Jun 2025 19:08:19 +0200
Subject: [PATCH 5/5] add missing BartModel

---
 .../ut_torch_export_patches/test_patch_rewrite.py   | 13 +++++++++++++
 onnx_diagnostic/helpers/doc_helper.py               |  4 ++--
 .../torch_export_patches/patch_module_helper.py     |  1 +
 onnx_diagnostic/torch_models/test_helper.py         |  8 ++++++++
 4 files changed, 24 insertions(+), 2 deletions(-)
 create mode 100644 _unittests/ut_torch_export_patches/test_patch_rewrite.py

diff --git a/_unittests/ut_torch_export_patches/test_patch_rewrite.py b/_unittests/ut_torch_export_patches/test_patch_rewrite.py
new file mode 100644
index 00000000..cd0cbd56
--- /dev/null
+++ b/_unittests/ut_torch_export_patches/test_patch_rewrite.py
@@ -0,0 +1,13 @@
+import unittest
+from onnx_diagnostic.ext_test_case import ExtTestCase
+from onnx_diagnostic.torch_export_patches.patch_module_helper import code_needing_rewriting
+
+
+class TestPatchRewrite(ExtTestCase):
+    def test_code_needing_rewriting(self):
+        res = code_needing_rewriting("BartModel")  # "BartModel" is a known rewriting key
+        self.assertEqual(len(res), 2)
+
+
+if __name__ == "__main__":
+    unittest.main(verbosity=2)
diff --git a/onnx_diagnostic/helpers/doc_helper.py b/onnx_diagnostic/helpers/doc_helper.py
index 97582018..406a6047 100644
--- a/onnx_diagnostic/helpers/doc_helper.py
+++ b/onnx_diagnostic/helpers/doc_helper.py
@@ -1,5 +1,5 @@
 import os
-from typing import Dict, Optional, Tuple
+from typing import Dict, List, Optional, Tuple
 import onnx
 import onnx.helper as oh
 import torch
@@ -7,7 +7,7 @@
 from .torch_helper import onnx_dtype_to_torch_dtype, torch_dtype_to_onnx_dtype
 from .ort_session import InferenceSessionForTorch
 
-_SAVED = []
+_SAVED: List[str] = []
 _SAVE_OPTIMIZED_MODEL_ = int(os.environ.get("DUMP_ONNX", "0"))
 
 
diff --git a/onnx_diagnostic/torch_export_patches/patch_module_helper.py b/onnx_diagnostic/torch_export_patches/patch_module_helper.py
index 07830431..fe8c8b14 100644
--- a/onnx_diagnostic/torch_export_patches/patch_module_helper.py
+++ b/onnx_diagnostic/torch_export_patches/patch_module_helper.py
@@ -80,6 +80,7 @@ def known_transformers_rewritings_clamp_float16() -> Dict[str, str]:
         "AutoformerModel": "AutoformerEncoderLayer",
         "BartEncoderLayer": "BartEncoderLayer",
         "BartForConditionalGeneration": "BartEncoderLayer",
+        "BartModel": "BartEncoderLayer",
         "BigBirdPegasusForConditionalGeneration": "BigBirdPegasusEncoderLayer",
         "BigBirdPegasusForQuestionAnswering": "BigBirdPegasusEncoderLayer",
         "BigBirdPegasusForCausalLM": "BigBirdPegasusEncoderLayer",
diff --git a/onnx_diagnostic/torch_models/test_helper.py b/onnx_diagnostic/torch_models/test_helper.py
index 57c17b35..b577c58c 100644
--- a/onnx_diagnostic/torch_models/test_helper.py
+++ b/onnx_diagnostic/torch_models/test_helper.py
@@ -387,6 +387,12 @@ def validate_model(
         if model_options:
             print(f"[validate_model] model_options={model_options!r}")
         print(f"[validate_model] get dummy inputs with input_options={input_options}...")
+        print(
+            f"[validate_model] rewrite={rewrite}, patch={patch}, "
+            f"stop_if_static={stop_if_static}"
+        )
+        print(f"[validate_model] exporter={exporter!r}, optimization={optimization!r}")
+        print(f"[validate_model] dump_folder={dump_folder!r}")
         summary["model_id"] = model_id
         summary["model_subfolder"] = subfolder or ""
 
@@ -446,6 +452,8 @@ def validate_model(
                 print(f"[validate_model] model_rewrite={summary['model_rewrite']}")
         else:
             del data["rewrite"]
+            if verbose:
+                print("[validate_model] no rewrite")
     if os.environ.get("PRINT_CONFIG", "0") in (1, "1"):
         print("[validate_model] -- PRINT CONFIG")
         print("-- type(config)", type(data["configuration"]))