diff --git a/_doc/technical/plot_layer_norm_discrepancies.py b/_doc/technical/plot_layer_norm_discrepancies.py
index 4a4d3b96..de7a6e71 100644
--- a/_doc/technical/plot_layer_norm_discrepancies.py
+++ b/_doc/technical/plot_layer_norm_discrepancies.py
@@ -6,21 +6,26 @@
 :ref:`l-plot-parallelized-reduction`, reduction operations
 are sensitive to parallelization.
 
-We consider a small model including a layer normalization
-followed by a matrix multiplication and we show that replacing
-a kernel by another one may significantly impact the output.
+Methodology
++++++++++++
+
+We consider a simple model with a LayerNormalization followed by a MatMul.
+Each operator can be run with :epkg:`onnxruntime` or :epkg:`pytorch`.
+We compare the four combinations.
 
 The model
 +++++++++
 """
 
 import itertools
+import numpy as np
 import pandas
 import onnx
 import onnx.helper as oh
 import onnxruntime
 import torch
 from onnx_array_api.plotting.graphviz_helper import plot_dot
+from onnx_diagnostic.doc import rotate_align, save_fig, plot_histogram, title
 from onnx_diagnostic.ext_test_case import unit_test_going
 from onnx_diagnostic.helpers import max_diff, string_diff, string_type
 from onnx_diagnostic.helpers.onnx_helper import onnx_dtype_name, onnx_dtype_to_np_dtype
@@ -79,6 +84,8 @@ def make_feeds(last_dim: int):
 
 
 def cast_feeds(itype, provider, feeds):
+    ttype = onnx_dtype_to_torch_dtype(itype)
+    np_dtype = onnx_dtype_to_np_dtype(itype)
     np_feeds = {k: v.detach().numpy() for k, v in feeds.items()}
     if provider == "CUDA":
         if not torch.cuda.is_available():
@@ -101,8 +108,6 @@ def cast_feeds(itype, provider, feeds):
 
 baseline = {}
 for provider, itype in itertools.product(["CPU", "CUDA"], [TFLOAT, TFLOAT16]):
-    ttype = onnx_dtype_to_torch_dtype(itype)
-    np_dtype = onnx_dtype_to_np_dtype(itype)
     tch_feeds, ort_feeds = cast_feeds(itype, provider, feeds)
     if tch_feeds is None:
         continue
@@ -143,13 +148,34 @@ def cast_feeds(itype, provider, feeds):
 
 # %%
 # Visually.
 
-df["abs"].plot.bar(title="Discrepancies ORT / torch for LayerNorm(X) @ W + B")
+save_fig(
+    rotate_align(
+        df[["abs"]].plot.bar(title="Discrepancies ORT / torch for LayerNorm(X) @ W + B")
+    ),
+    "plot_layer_norm_discrepancies_1.png",
+)
 
 # %%
 # The discrepancies are significant on CUDA, higher for float16.
 # Let's see which operator is responsible for them,
 # *LayerNormalization* or *MatMul*.
 
+# %%
+# Distribution of the results
+# +++++++++++++++++++++++++++
+
+tensor = baseline[TFLOAT16, "CPU", "ort"][0].ravel().astype(np.float32)
+print(pandas.DataFrame({"expected": tensor}).describe())
+
+# %%
+# Histogram.
+
+save_fig(
+    title(plot_histogram(tensor), "Distribution of the computed results"),
+    "plot_layer_norm_discrepancies_hist.png",
+)
+
+
 # %%
 # The discrepancies come from?
 # ++++++++++++++++++++++++++++
@@ -159,7 +185,7 @@ def cast_feeds(itype, provider, feeds):
 
 data = []
 for mod, provider, itype in itertools.product(
-    ["ORT-TORCH", "TORCH-ORT"], ["CPU", "CUDA"], [TFLOAT, TFLOAT16]
+    ["ORT-ORT", "ORT-TORCH", "TORCH-ORT", "TORCH-TORCH"], ["CPU", "CUDA"], [TFLOAT, TFLOAT16]
 ):
     ttype = onnx_dtype_to_torch_dtype(itype)
     np_dtype = onnx_dtype_to_np_dtype(itype)
@@ -167,11 +193,10 @@ def cast_feeds(itype, provider, feeds):
     if tch_feeds is None:
         continue
 
+    ker1, ker2 = mod.split("-")
     custom_kernels = (
-        {("", "LayerNormalization"): LayerNormalizationOrt}
-        if mod == "ORT-TORCH"
-        else {("", "MatMul"): MatMulOrt}
-    )
+        {("", "LayerNormalization"): LayerNormalizationOrt} if ker1 == "ORT" else {}
+    ) | ({("", "MatMul"): MatMulOrt} if ker2 == "ORT" else {})
     model = get_model(itype)
 
     print()
@@ -200,13 +225,27 @@ def cast_feeds(itype, provider, feeds):
 )
 
 # %%
-df = pandas.DataFrame(data).set_index(["model", "provider", "dtype"])
+df = pandas.DataFrame(data).set_index(["dtype", "provider", "model"])
 df = df.sort_index()
 print(df)
 
 # %%
 # Visually.
 
-df[["diff_ort", "diff_torch"]].plot.bar(
-    title="ORT/Torch or Torch/ORT for LayerNorm(X) @ W + B"
+save_fig(
+    rotate_align(
+        df[["diff_ort", "diff_torch"]].plot.bar(
+            title="ORT/Torch or Torch/ORT for LayerNorm(X) @ W + B",
+            figsize=(10, 4),
+        )
+    ),
+    "plot_layer_norm_discrepancies_2.png",
 )
+
+# %%
+# Conclusion
+# ++++++++++
+#
+# :epkg:`torch` appears to produce identical results when the same computation
+# is run multiple times. :epkg:`onnxruntime` only achieves this on CUDA.
+# On CUDA with float16, LayerNormalization seems to introduce some discrepancies.
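The dict union in the hunk above is what selects the backend for each operator: each half of `mod` enables the corresponding ONNX Runtime kernel, and an empty mapping keeps the torch kernel. A minimal standalone sketch of that selection logic, with placeholder classes standing in for the real kernels from `onnx_diagnostic.helpers.doc_helper`:

    # Placeholders for the ORT-backed kernels used by the script.
    class LayerNormalizationOrt: ...
    class MatMulOrt: ...

    for mod in ["ORT-ORT", "ORT-TORCH", "TORCH-ORT", "TORCH-TORCH"]:
        # The first half drives LayerNormalization, the second half MatMul.
        ker1, ker2 = mod.split("-")
        custom_kernels = (
            {("", "LayerNormalization"): LayerNormalizationOrt} if ker1 == "ORT" else {}
        ) | ({("", "MatMul"): MatMulOrt} if ker2 == "ORT" else {})
        print(mod, sorted(k[1] for k in custom_kernels))

`TORCH-TORCH` yields an empty mapping, i.e. the pure torch run; the dict union (`|`, Python 3.9+) merges the two independent choices.
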
diff --git a/_doc/technical/plot_parallelized_reduction.py b/_doc/technical/plot_parallelized_reduction.py
index 25e91d44..18b314ea 100644
--- a/_doc/technical/plot_parallelized_reduction.py
+++ b/_doc/technical/plot_parallelized_reduction.py
@@ -23,6 +23,12 @@
 
 With :math:`\\mathbb{E}X = mean(X)`,
 :math:`\\mathbb{V}X = mean\\left(\\left(X - mean(X)\\right)^2\\right)`.
+
+Methodology
++++++++++++
+
+**Permutation should not change the average.**
+
 We draw 128 random permutations of X. The average or mean should not change.
 And the normalized vector should have the same values.
 In the first case, we compute the difference between the highest
 and the lowest values obtained for the average.
@@ -188,6 +194,7 @@ def make_value(base, value):
 
 # %%
 # Visually.
 
 ax = df.plot.bar(logy=True)
+ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
 fig = ax.get_figure()
 fig.savefig("plot_parallelized_reduction.png")
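The methodology added above rests on the fact that a permutation leaves the exact mean unchanged, while floating-point summation is order-dependent, so the computed mean can drift with element order. A minimal numpy illustration of that measurement (array size, dtype and seed are arbitrary here, not the script's actual parameters):

    import numpy as np

    rng = np.random.default_rng(0)
    x = rng.standard_normal(2**20).astype(np.float32)

    # The exact mean is permutation-invariant; the computed float32 mean is
    # not necessarily, because the summation order changes the rounding.
    means = [float(np.mean(rng.permutation(x))) for _ in range(128)]
    print("max-min over 128 permutations:", max(means) - min(means))
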
diff --git a/_unittests/ut_torch_export_patches/test_patch_rewrite.py b/_unittests/ut_torch_export_patches/test_patch_rewrite.py
new file mode 100644
index 00000000..cd0cbd56
--- /dev/null
+++ b/_unittests/ut_torch_export_patches/test_patch_rewrite.py
@@ -0,0 +1,13 @@
+import unittest
+from onnx_diagnostic.ext_test_case import ExtTestCase
+from onnx_diagnostic.torch_export_patches.patch_module_helper import code_needing_rewriting
+
+
+class TestPatchRewrite(ExtTestCase):
+    def test_code_needing_rewriting(self):
+        res = code_needing_rewriting("BartModel")
+        self.assertEqual(len(res), 2)
+
+
+if __name__ == "__main__":
+    unittest.main(verbosity=2)
diff --git a/_unittests/ut_torch_models/test_test_helpers.py b/_unittests/ut_torch_models/test_test_helpers.py
index b16d5fbc..7a990896 100644
--- a/_unittests/ut_torch_models/test_test_helpers.py
+++ b/_unittests/ut_torch_models/test_test_helpers.py
@@ -176,7 +176,7 @@ def test_validate_model_custom_torch(self):
             mid,
             do_run=True,
             verbose=10,
-            exporter="custom-inline",
+            exporter="custom-noinline",
             dump_folder="dump_test_validate_model_custom_torch",
             patch=True,
             stop_if_static=2 if pv.Version(torch.__version__) > pv.Version("2.6.1") else 0,
diff --git a/k.py b/k.py
deleted file mode 100644
index fd3cdc57..00000000
--- a/k.py
+++ /dev/null
@@ -1,30 +0,0 @@
-import onnx
-import onnx.helper as oh
-import torch
-from onnx_diagnostic.helpers import string_type
-from onnx_diagnostic.reference import TorchOnnxEvaluator
-
-TFLOAT = onnx.TensorProto.FLOAT
-
-proto = oh.make_model(
-    oh.make_graph(
-        [
-            oh.make_node("Sigmoid", ["Y"], ["sy"]),
-            oh.make_node("Mul", ["Y", "sy"], ["ysy"]),
-            oh.make_node("Mul", ["X", "ysy"], ["final"]),
-        ],
-        "-nd-",
-        [
-            oh.make_tensor_value_info("X", TFLOAT, [1, "b", "c"]),
-            oh.make_tensor_value_info("Y", TFLOAT, ["a", "b", "c"]),
-        ],
-        [oh.make_tensor_value_info("final", TFLOAT, ["a", "b", "c"])],
-    ),
-    opset_imports=[oh.make_opsetid("", 18)],
-    ir_version=9,
-)
-
-sess = TorchOnnxEvaluator(proto, verbose=1)
-feeds = dict(X=torch.rand((4, 5)), Y=torch.rand((4, 5)))
-result = sess.run(None, feeds)
-print(string_type(result, with_shape=True, with_min_max=True))
diff --git a/onnx_diagnostic/doc.py b/onnx_diagnostic/doc.py
index 5e517ad5..85391826 100644
--- a/onnx_diagnostic/doc.py
+++ b/onnx_diagnostic/doc.py
@@ -1,3 +1,7 @@
+from typing import Optional
+import numpy as np
+
+
 def reset_torch_transformers(gallery_conf, fname):
     "Resets torch dynamo for :epkg:`sphinx-gallery`."
     import matplotlib.pyplot as plt
@@ -30,3 +34,45 @@ def plot_legend(
     ax.grid(False)
     ax.set_axis_off()
     return ax
+
+
+def rotate_align(ax, angle=15, align="right"):
+    """Rotates the x-tick labels and aligns them (right by default). Returns ax."""
+    for label in ax.get_xticklabels():
+        label.set_rotation(angle)
+        label.set_horizontalalignment(align)
+    return ax
+
+
+def save_fig(ax, name: str):
+    """Applies ``tight_layout`` and saves the figure. Returns ax."""
+    import matplotlib.pyplot as plt
+
+    plt.tight_layout()
+    fig = ax.get_figure()
+    fig.savefig(name)
+    return ax
+
+
+def title(ax: "plt.axes", title: str) -> "plt.axes":  # noqa: F821
+    "Adds a title to axes and returns them."
+    ax.set_title(title)
+    return ax
+
+
+def plot_histogram(
+    tensor: np.ndarray,
+    ax: Optional["plt.axes"] = None,  # noqa: F821
+    bins: int = 30,
+    color: str = "orange",
+    alpha: float = 0.7,
+) -> "plt.axes":  # noqa: F821
+    "Plots the histogram of a tensor."
+    if ax is None:
+        import matplotlib.pyplot as plt
+
+        ax = plt.gca()
+        ax.cla()
+    ax.hist(tensor, bins=bins, color=color, alpha=alpha)
+    ax.set_yscale("log")
+    return ax
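The four new helpers each take a matplotlib axes and return it, so they compose by nesting, as the gallery scripts above do. A hypothetical usage sketch (the DataFrame values and file names are made up):

    import matplotlib.pyplot as plt
    import numpy as np
    import pandas
    from onnx_diagnostic.doc import plot_histogram, rotate_align, save_fig, title

    df = pandas.DataFrame(
        {"abs": [1e-6, 3e-4, 2e-3]},
        index=["float32/CPU", "float32/CUDA", "float16/CUDA"],
    )
    # Bar plot with rotated tick labels, saved after tight_layout.
    save_fig(rotate_align(df["abs"].plot.bar(logy=True)), "example_bars.png")

    # Log-scale histogram of a tensor, with a title, drawn on a fresh axes.
    fig, ax = plt.subplots()
    save_fig(
        title(plot_histogram(np.random.rand(1024), ax=ax), "distribution"),
        "example_hist.png",
    )
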
diff --git a/onnx_diagnostic/helpers/doc_helper.py b/onnx_diagnostic/helpers/doc_helper.py
index cd9e7d97..406a6047 100644
--- a/onnx_diagnostic/helpers/doc_helper.py
+++ b/onnx_diagnostic/helpers/doc_helper.py
@@ -1,4 +1,5 @@
-from typing import Dict, Optional, Tuple
+import os
+from typing import Dict, List, Optional, Tuple
 import onnx
 import onnx.helper as oh
 import torch
@@ -6,6 +7,17 @@
 from .torch_helper import onnx_dtype_to_torch_dtype, torch_dtype_to_onnx_dtype
 from .ort_session import InferenceSessionForTorch
 
+_SAVED: List[str] = []
+_SAVE_OPTIMIZED_MODEL_ = int(os.environ.get("DUMP_ONNX", "0"))
+
+
+def _get_model_name(op_name: str, provider: str) -> Optional[str]:
+    if _SAVE_OPTIMIZED_MODEL_:
+        name = f"dump_doc_{op_name}_{provider}_{len(_SAVED)}.onnx"
+        _SAVED.append(name)
+        return name
+    return None
+
 
 class LayerNormalizationOrt(OpRunKernel):
     "LayerNormalization with onnxruntime"
@@ -13,7 +25,7 @@ class LayerNormalizationOrt(OpRunKernel):
     @classmethod
     def device_dependent(cls) -> bool:
         "Needs device."
-        return False
+        return True
 
     def __init__(
         self,
@@ -70,7 +82,11 @@ def _make_model(self, itype: int, rank: int, has_bias: bool) -> onnx.ModelProto:
         )
         provider = "CPUExecutionProvider" if self.is_cpu else "CUDAExecutionProvider"
         self._provider = provider
-        return InferenceSessionForTorch(layer_model, providers=[provider])
+        return InferenceSessionForTorch(
+            layer_model,
+            optimized_model_filepath=_get_model_name("layer_norm", provider),
+            providers=[provider],
+        )
 
     def run(self, x, scale, bias=None):
         itype = torch_dtype_to_onnx_dtype(x.dtype)
@@ -94,7 +110,7 @@ class MatMulOrt(OpRunKernel):
 
     @classmethod
     def device_dependent(cls) -> bool:
         "Needs device."
-        return False
+        return True
 
     def __init__(
         self,
@@ -127,7 +143,11 @@ def _make_model(self, itype: int, ranka: int, rankb: int) -> onnx.ModelProto:
         )
         provider = "CPUExecutionProvider" if self.is_cpu else "CUDAExecutionProvider"
         self._provider = provider
-        return InferenceSessionForTorch(model, providers=[provider])
+        return InferenceSessionForTorch(
+            model,
+            optimized_model_filepath=_get_model_name("matmul", provider),
+            providers=[provider],
+        )
 
     def run(self, a, b):
         itype = torch_dtype_to_onnx_dtype(a.dtype)
diff --git a/onnx_diagnostic/torch_export_patches/patch_module_helper.py b/onnx_diagnostic/torch_export_patches/patch_module_helper.py
index 07830431..fe8c8b14 100644
--- a/onnx_diagnostic/torch_export_patches/patch_module_helper.py
+++ b/onnx_diagnostic/torch_export_patches/patch_module_helper.py
@@ -80,6 +80,7 @@ def known_transformers_rewritings_clamp_float16() -> Dict[str, str]:
         "AutoformerModel": "AutoformerEncoderLayer",
         "BartEncoderLayer": "BartEncoderLayer",
         "BartForConditionalGeneration": "BartEncoderLayer",
+        "BartModel": "BartEncoderLayer",
         "BigBirdPegasusForConditionalGeneration": "BigBirdPegasusEncoderLayer",
         "BigBirdPegasusForQuestionAnswering": "BigBirdPegasusEncoderLayer",
         "BigBirdPegasusForCausalLM": "BigBirdPegasusEncoderLayer",
diff --git a/onnx_diagnostic/torch_models/test_helper.py b/onnx_diagnostic/torch_models/test_helper.py
index c41009f1..b577c58c 100644
--- a/onnx_diagnostic/torch_models/test_helper.py
+++ b/onnx_diagnostic/torch_models/test_helper.py
@@ -387,6 +387,12 @@ def validate_model(
         if model_options:
             print(f"[validate_model] model_options={model_options!r}")
         print(f"[validate_model] get dummy inputs with input_options={input_options}...")
+        print(
+            f"[validate_model] rewrite={rewrite}, patch={patch}, "
+            f"stop_if_static={stop_if_static}"
+        )
+        print(f"[validate_model] exporter={exporter!r}, optimization={optimization!r}")
+        print(f"[validate_model] dump_folder={dump_folder!r}")
     summary["model_id"] = model_id
     summary["model_subfolder"] = subfolder or ""
@@ -446,6 +452,8 @@ def validate_model(
             print(f"[validate_model] model_rewrite={summary['model_rewrite']}")
     else:
         del data["rewrite"]
+        if verbose:
+            print("[validate_model] no rewrite")
     if os.environ.get("PRINT_CONFIG", "0") in (1, "1"):
         print("[validate_model] -- PRINT CONFIG")
         print("-- type(config)", type(data["configuration"]))
@@ -1334,13 +1342,13 @@ def call_torch_export_custom(
         "custom-nostrict",
         "custom-nostrict-default",
         "custom-nostrict-all",
-        "custom-inline",
-        "custom-strict-inline",
-        "custom-strict-default-inline",
-        "custom-strict-all-inline",
-        "custom-nostrict-inline",
-        "custom-nostrict-default-inline",
-        "custom-nostrict-all-inline",
+        "custom-noinline",
+        "custom-strict-noinline",
+        "custom-strict-default-noinline",
+        "custom-strict-all-noinline",
+        "custom-nostrict-noinline",
+        "custom-nostrict-default-noinline",
+        "custom-nostrict-all-noinline",
     }
     assert exporter in available, f"Unexpected value for exporter={exporter!r} in {available}"
     assert "model" in data, f"model is missing from data: {sorted(data)}"
@@ -1381,10 +1389,7 @@ def call_torch_export_custom(
         ),
         save_ep=(os.path.join(dump_folder, f"{exporter}.ep") if dump_folder else None),
     )
-    inline = "-inline" in exporter
-    if inline:
-        export_options.aten_as_function = set()
-
+    inline = "-noinline" not in exporter
     options = OptimizationOptions(patterns=optimization) if optimization else None
     model = data["model"]
    kws = dict(