diff --git a/.gitignore b/.gitignore
index c6ea96f1..6a452352 100644
--- a/.gitignore
+++ b/.gitignore
@@ -29,3 +29,9 @@ examples/onnxruntime_profile*.json
 version.txt
 _doc/bench/*.svg
 _doc/examples/*.svg
+_doc/bench/*.html
+_doc/examples/*.html
+_doc/examples/*.json
+_doc/sphinxdoc/source/phdoc_static/*.js
+_doc/sphinxdoc/source/phdoc_static/reveal.js/*
+_doc/sphinxdoc/source/phdoc_static/style_notebook_snippet.css
diff --git a/_doc/bench/bench_ortmodule_nn_gpu.py b/_doc/bench/bench_ortmodule_nn_gpu.py
new file mode 100644
index 00000000..d94d682b
--- /dev/null
+++ b/_doc/bench/bench_ortmodule_nn_gpu.py
@@ -0,0 +1,218 @@
+"""
+
+.. _l-orttraining-nn-benchmark:
+
+Benchmark ORTModule on a neural network
+=======================================
+
+To make it work, you may need to run:
+
+::
+
+    python -c "from onnxruntime.training.ortmodule.torch_cpp_extensions import install as ortmodule_install;ortmodule_install.build_torch_cpp_extensions()"
+
+You may profile the full example on CPU with :epkg:`py-spy`:
+
+::
+
+    py-spy record -o bench_ortmodule_nn_gpu.svg -r 10 --native -- python bench_ortmodule_nn_gpu.py
+    py-spy record -o bench_ortmodule_nn_gpu.svg -r 20 -- python bench_ortmodule_nn_gpu.py --n_features 100 --hidden_layer_sizes "30,30"
+
+The Python code can be profiled with :epkg:`pyinstrument`:
+
+::
+
+    python -m pyinstrument --show-all -r html -o bench_ortmodule_nn_gpu.html bench_ortmodule_nn_gpu.py --n_features 100 --hidden_layer_sizes "30,30"
+
+And with `nvprof` on GPU:
+
+::
+
+    nvprof -o bench_ortmodule_nn_gpu.nvprof python bench_ortmodule_nn_gpu.py --run_torch 0 --device cuda --opset 14
+
+.. contents::
+    :local:
+
+A neural network with pytorch
++++++++++++++++++++++++++++++++
+
+"""
+import time
+import numpy
+from sklearn.datasets import make_regression
+from sklearn.model_selection import train_test_split
+from sklearn.utils import shuffle
+import torch
+import torch.nn.functional as F
+from onnxruntime.training import ORTModule
+
+
+def benchmark(N=1000, n_features=20, hidden_layer_sizes="26,25", max_iter=1000,
+              learning_rate_init=1e-4, batch_size=100, run_torch=True,
+              device='cpu', opset=12):
+    """
+    Compares :epkg:`onnxruntime-training` to :epkg:`pytorch` for
+    training. Training algorithm is SGD.
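+
+    The function is exposed on the command line through the *fire*
+    module (see the ``fire.Fire(benchmark)`` call at the end of the
+    script), so every parameter can be set as a flag; the values below
+    are only an illustration:
+
+    ::
+
+        python bench_ortmodule_nn_gpu.py --max_iter 100 --device cuda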
+
+    :param N: number of observations to train on
+    :param n_features: number of features
+    :param hidden_layer_sizes: hidden layer sizes, comma separated values
+    :param max_iter: number of iterations
+    :param learning_rate_init: initial learning rate
+    :param batch_size: batch size
+    :param run_torch: run the :epkg:`pytorch` training as well (True)
+        or only the training through *ORTModule* (False)
+    :param device: `'cpu'` or `'cuda'`
+    :param opset: opset to choose for the conversion
+    """
+    N = int(N)
+    n_features = int(n_features)
+    max_iter = int(max_iter)
+    learning_rate_init = float(learning_rate_init)
+    batch_size = int(batch_size)
+    run_torch = run_torch in (1, True, '1', 'True')
+
+    print("N=%d" % N)
+    print("n_features=%d" % n_features)
+    print("hidden_layer_sizes=%r" % (hidden_layer_sizes, ))
+    print("max_iter=%d" % max_iter)
+    print("learning_rate_init=%f" % learning_rate_init)
+    print("batch_size=%d" % batch_size)
+    print("run_torch=%r" % run_torch)
+    print("opset=%r (unused)" % opset)
+    print("device=%r" % device)
+    device0 = device
+    device = torch.device(
+        "cuda:0" if device in ('cuda', 'cuda:0', 'gpu') else "cpu")
+    print("fixed device=%r" % device)
+    print('------------------')
+
+    if not isinstance(hidden_layer_sizes, tuple):
+        hidden_layer_sizes = tuple(map(int, hidden_layer_sizes.split(",")))
+    X, y = make_regression(N, n_features=n_features, bias=2)
+    X = X.astype(numpy.float32)
+    y = y.astype(numpy.float32)
+    X_train, X_test, y_train, y_test = train_test_split(X, y)
+
+    class Net(torch.nn.Module):
+        def __init__(self, n_features, hidden, n_output):
+            super(Net, self).__init__()
+            self.hidden = []
+            size = n_features
+            for i, hid in enumerate(hidden):
+                self.hidden.append(torch.nn.Linear(size, hid))
+                size = hid
+                setattr(self, "hid%d" % i, self.hidden[-1])
+            self.hidden.append(torch.nn.Linear(size, n_output))
+            setattr(self, "predict", self.hidden[-1])
+
+        def forward(self, x):
+            # relu on the hidden layers only, the last layer stays
+            # linear because the network is a regressor
+            for hid in self.hidden[:-1]:
+                x = F.relu(hid(x))
+            return self.hidden[-1](x)
+
+    nn = Net(n_features, hidden_layer_sizes, 1)
+    if device0 == 'cpu':
+        nn.cpu()
+    else:
+        nn.cuda(device=device)
+    print("n_parameters=%d, n_layers=%d" % (
+        len(list(nn.parameters())), len(nn.hidden)))
+    for i, p in enumerate(nn.parameters()):
+        print("  p[%d].shape=%r" % (i, p.shape))
+
+    optimizer = torch.optim.SGD(nn.parameters(), lr=learning_rate_init)
+    criterion = torch.nn.MSELoss(reduction='sum')
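+    # number of complete mini-batches per epoch, the remaining
+    # observations are dropped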
+    batch_no = len(X_train) // batch_size
+
+    # training
+
+    def train_torch():
+        for epoch in range(max_iter):
+            running_loss = 0.0
+            x, y = shuffle(X_train, y_train)
+            for i in range(batch_no):
+                start = i * batch_size
+                end = start + batch_size
+                inputs = torch.tensor(
+                    x[start:end], requires_grad=True, device=device)
+                labels = torch.tensor(
+                    y[start:end], requires_grad=True, device=device)
+
+                def step_torch():
+                    optimizer.zero_grad()
+                    outputs = nn(inputs)
+                    loss = criterion(outputs, torch.unsqueeze(labels, dim=1))
+                    loss.backward()
+                    optimizer.step()
+                    return loss
+
+                loss = step_torch()
+                running_loss += loss.item()
+        return running_loss
+
+    begin = time.perf_counter()
+    if run_torch:
+        running_loss = train_torch()
+    dur_torch = time.perf_counter() - begin
+
+    if run_torch:
+        print("time_torch=%r, running_loss=%r" % (dur_torch, running_loss))
+        running_loss0 = running_loss
+    else:
+        running_loss0 = -1
+
+    # ORTModule
+    nn = Net(n_features, hidden_layer_sizes, 1)
+    if device0 == 'cpu':
+        nn.cpu()
+    else:
+        nn.cuda(device=device)
+
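+    # ORTModule wraps the torch module, exports it to ONNX on the
+    # first call and runs forward and backward with onnxruntime-training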
+    nn_ort = ORTModule(nn)
+    optimizer = torch.optim.SGD(nn_ort.parameters(), lr=learning_rate_init)
+    criterion = torch.nn.MSELoss(reduction='sum')
+
+    def train_ort():
+        for epoch in range(max_iter):
+            running_loss = 0.0
+            x, y = shuffle(X_train, y_train)
+            for i in range(batch_no):
+                start = i * batch_size
+                end = start + batch_size
+                inputs = torch.tensor(
+                    x[start:end], requires_grad=True, device=device)
+                labels = torch.tensor(
+                    y[start:end], requires_grad=True, device=device)
+
+                def step_ort():
+                    optimizer.zero_grad()
+                    outputs = nn_ort(inputs)
+                    loss = criterion(outputs, torch.unsqueeze(labels, dim=1))
+                    loss.backward()
+                    optimizer.step()
+                    return loss
+
+                loss = step_ort()
+                running_loss += loss.item()
+        return running_loss
+
+    begin = time.perf_counter()
+    running_loss = train_ort()
+    dur_ort = time.perf_counter() - begin
+
+    print("time_torch=%r, running_loss=%r" % (dur_torch, running_loss0))
+    print("time_ort=%r, last_trained_error=%r" % (dur_ort, running_loss))
+
+
+if __name__ == "__main__":
+    import fire
+    fire.Fire(benchmark)
diff --git a/_doc/bench/bench_orttraining_nn_gpu.py b/_doc/bench/bench_orttraining_nn_gpu.py
index e5bbc99e..8d70a6c4 100644
--- a/_doc/bench/bench_orttraining_nn_gpu.py
+++ b/_doc/bench/bench_orttraining_nn_gpu.py
@@ -66,7 +66,7 @@ def benchmark(N=1000, n_features=20, hidden_layer_sizes="25,25", max_iter=1000,
 
     print("N=%d" % N)
     print("n_features=%d" % n_features)
-    print("hidden_layer_sizes=%s" % hidden_layer_sizes)
+    print("hidden_layer_sizes=%r" % (hidden_layer_sizes, ))
     print("max_iter=%d" % max_iter)
     print("learning_rate_init=%f" % learning_rate_init)
     print("batch_size=%d" % batch_size)
@@ -75,7 +75,8 @@ def benchmark(N=1000, n_features=20, hidden_layer_sizes="25,25", max_iter=1000,
     print("device=%r" % device)
     print('------------------')
 
-    hidden_layer_sizes = tuple(map(int, hidden_layer_sizes.split(",")))
+    if not isinstance(hidden_layer_sizes, tuple):
+        hidden_layer_sizes = tuple(map(int, hidden_layer_sizes.split(",")))
     X, y = make_regression(N, n_features=n_features, bias=2)
     X = X.astype(numpy.float32)
     y = y.astype(numpy.float32)
@@ -92,7 +93,7 @@ def benchmark(N=1000, n_features=20, hidden_layer_sizes="25,25", max_iter=1000,
     nn.fit(X_train, y_train)
     dur_skl = time.perf_counter() - begin
 
-    print("time_kl=%r, mean_squared_error=%r" % (
+    print("time_skl=%r, mean_squared_error=%r" % (
         dur_skl, mean_squared_error(y_train, nn.predict(X_train))))
 
     # conversion to ONNX
@@ -119,7 +120,7 @@ def benchmark(N=1000, n_features=20, hidden_layer_sizes="25,25", max_iter=1000,
     begin = time.perf_counter()
     train_session.fit(X, y)
     dur_ort = time.perf_counter() - begin
-    print("time_kl=%r, mean_squared_error=%r" % (
+    print("time_skl=%r, mean_squared_error=%r" % (
         dur_skl, mean_squared_error(y_train, nn.predict(X_train))))
     print("time_ort=%r, last_trained_error=%r" % (
         dur_ort, train_session.train_losses_[-1]))
diff --git a/_doc/sphinxdoc/source/api.rst b/_doc/sphinxdoc/source/api.rst
deleted file mode 100644
index 092ac3e2..00000000
--- a/_doc/sphinxdoc/source/api.rst
+++ /dev/null
@@ -1,24 +0,0 @@
-
-===
-API
-===
-
-.. contents::
-    :local:
-
-Data
-++++
-
-.. autoclass:: onnxcustom.training.data_loader.OrtDataLoader
-
-Training
-++++++++
-
-.. autofunction:: onnxcustom.training.optimizers.OrtGradientOptimizer
-
-.. autofunction:: onnxcustom.training.orttraining.add_loss_output
-
-Utils
-+++++
-
-.. autofunction:: onnxcustom.utils.measure_time
diff --git a/_doc/sphinxdoc/source/api/data_loader.rst b/_doc/sphinxdoc/source/api/data_loader.rst
new file mode 100644
index 00000000..4a334164
--- /dev/null
+++ b/_doc/sphinxdoc/source/api/data_loader.rst
@@ -0,0 +1,5 @@
+
+DataLoader
+==========
+
+.. autoclass:: onnxcustom.training.data_loader.OrtDataLoader
diff --git a/_doc/sphinxdoc/source/api/index.rst b/_doc/sphinxdoc/source/api/index.rst
new file mode 100644
index 00000000..2ba2d057
--- /dev/null
+++ b/_doc/sphinxdoc/source/api/index.rst
@@ -0,0 +1,10 @@
+
+===
+API
+===
+
+.. toctree::
+
+    utils
+    data_loader
+    training
diff --git a/_doc/sphinxdoc/source/api/training.rst b/_doc/sphinxdoc/source/api/training.rst
new file mode 100644
index 00000000..a64ecbb4
--- /dev/null
+++ b/_doc/sphinxdoc/source/api/training.rst
@@ -0,0 +1,23 @@
+
+Training
+========
+
+.. contents::
+    :local:
+
+BaseEstimator
++++++++++++++
+
+.. autoclass:: onnxcustom.training.optimizers.BaseEstimator
+
+OrtGradientOptimizer
+++++++++++++++++++++
+
+.. autoclass:: onnxcustom.training.optimizers.OrtGradientOptimizer
+
+Helpers
++++++++
+
+.. autofunction:: onnxcustom.training.orttraining.add_loss_output
+
+.. autofunction:: onnxcustom.training.orttraining.get_train_initializer
diff --git a/_doc/sphinxdoc/source/api/utils.rst b/_doc/sphinxdoc/source/api/utils.rst
new file mode 100644
index 00000000..c29bfe4c
--- /dev/null
+++ b/_doc/sphinxdoc/source/api/utils.rst
@@ -0,0 +1,5 @@
+
+Utils
+=====
+
+.. autofunction:: onnxcustom.utils.measure_time
diff --git a/_doc/sphinxdoc/source/conf.py b/_doc/sphinxdoc/source/conf.py
index f5d52481..349d256b 100644
--- a/_doc/sphinxdoc/source/conf.py
+++ b/_doc/sphinxdoc/source/conf.py
@@ -19,6 +19,10 @@
     issue=('https://github.com/sdpython/onnxcustom/issues/%s', 'issue')),
     title="onnxcustom", book=True)
 
+extensions.extend([
+    "sphinxcontrib.blockdiag"
+])
+
 blog_root = "http://www.xavierdupre.fr/app/onnxcustom/helpsphinx/"
 
 html_css_files = ['my-styles.css']
@@ -88,6 +92,7 @@
     'onnxruntime-training':
        'https://github.com/microsoft/onnxruntime/tree/master/orttraining',
     'openmp': 'https://en.wikipedia.org/wiki/OpenMP',
+    'py-spy': 'https://github.com/benfred/py-spy',
     'pyinstrument': 'https://github.com/joerick/pyinstrument',
     'python': 'https://www.python.org/',
     'pytorch': 'https://pytorch.org/',
@@ -97,6 +102,7 @@
     'sphinx-gallery': 'https://github.com/sphinx-gallery/sphinx-gallery',
     'Stochastic Gradient Descent':
        'https://en.wikipedia.org/wiki/Stochastic_gradient_descent',
+    'tqdm': 'https://github.com/tqdm/tqdm',
     'TreeEnsembleRegressor':
        'https://github.com/onnx/onnx/blob/master/docs/Operators-ml.md'
        '#ai.onnx.ml.TreeEnsembleRegressor',
diff --git a/_doc/sphinxdoc/source/index.rst b/_doc/sphinxdoc/source/index.rst
index 8b2c092f..42c97bc1 100644
--- a/_doc/sphinxdoc/source/index.rst
+++ b/_doc/sphinxdoc/source/index.rst
@@ -51,9 +51,9 @@ operators.
 .. toctree::
     :maxdepth: 1
 
-    tutorial
+    tutorial/index
     doc
-    api
+    api/index
     auto_examples/index
     dev
     versions
diff --git a/_doc/sphinxdoc/source/tutorial.rst b/_doc/sphinxdoc/source/tutorial/tutorial.rst
similarity index 100%
rename from _doc/sphinxdoc/source/tutorial.rst
rename to _doc/sphinxdoc/source/tutorial/tutorial.rst
diff --git a/_doc/sphinxdoc/source/tutorial_1-5_external.rst b/_doc/sphinxdoc/source/tutorial/tutorial_1-5_external.rst
similarity index 82%
rename from _doc/sphinxdoc/source/tutorial_1-5_external.rst
rename to _doc/sphinxdoc/source/tutorial/tutorial_1-5_external.rst
index a2e1631f..bc0fd95b 100644
--- a/_doc/sphinxdoc/source/tutorial_1-5_external.rst
+++ b/_doc/sphinxdoc/source/tutorial/tutorial_1-5_external.rst
@@ -11,5 +11,5 @@ model are part of a pipeline.
 .. toctree::
     :maxdepth: 1
 
-    auto_examples/plot_gexternal_lightgbm
-    auto_examples/plot_gexternal_xgboost
+    ../gyexamples/plot_gexternal_lightgbm
+    ../gyexamples/plot_gexternal_xgboost
diff --git a/_doc/sphinxdoc/source/tutorial_1_simple.rst b/_doc/sphinxdoc/source/tutorial/tutorial_1_simple.rst
similarity index 52%
rename from _doc/sphinxdoc/source/tutorial_1_simple.rst
rename to _doc/sphinxdoc/source/tutorial/tutorial_1_simple.rst
index 92d1ee8c..52787380 100644
--- a/_doc/sphinxdoc/source/tutorial_1_simple.rst
+++ b/_doc/sphinxdoc/source/tutorial/tutorial_1_simple.rst
@@ -14,13 +14,13 @@ used in the ONNX graph.
 .. toctree::
     :maxdepth: 1
 
-    auto_examples/plot_abegin_convert_pipeline
-    auto_examples/plot_bbegin_measure_time
-    auto_examples/plot_cbegin_opset
-    auto_examples/plot_dbegin_options
-    auto_examples/plot_dbegin_options_list
-    auto_examples/plot_ebegin_float_double
-    auto_examples/plot_fbegin_investigate
-    auto_examples/plot_gbegin_dataframe
-    auto_examples/plot_gbegin_transfer_learning
-    auto_examples/plot_gbegin_cst
+    ../gyexamples/plot_abegin_convert_pipeline
+    ../gyexamples/plot_bbegin_measure_time
+    ../gyexamples/plot_cbegin_opset
+    ../gyexamples/plot_dbegin_options
+    ../gyexamples/plot_dbegin_options_list
+    ../gyexamples/plot_ebegin_float_double
+    ../gyexamples/plot_fbegin_investigate
+    ../gyexamples/plot_gbegin_dataframe
+    ../gyexamples/plot_gbegin_transfer_learning
+    ../gyexamples/plot_gbegin_cst
diff --git a/_doc/sphinxdoc/source/tutorial_2_new_converter.rst b/_doc/sphinxdoc/source/tutorial/tutorial_2_new_converter.rst
similarity index 71%
rename from _doc/sphinxdoc/source/tutorial_2_new_converter.rst
rename to _doc/sphinxdoc/source/tutorial/tutorial_2_new_converter.rst
index 87f1c333..ee45edf5 100644
--- a/_doc/sphinxdoc/source/tutorial_2_new_converter.rst
+++ b/_doc/sphinxdoc/source/tutorial/tutorial_2_new_converter.rst
@@ -29,11 +29,11 @@ Following section shows how to create a custom converter.
 .. toctree::
     :maxdepth: 1
 
-    auto_examples/plot_icustom_converter
-    auto_examples/plot_jcustom_syntax
-    auto_examples/plot_kcustom_converter_wrapper
-    auto_examples/plot_lcustom_options
-    auto_examples/plot_mcustom_parser
-    auto_examples/plot_mcustom_parser_dataframe
-    auto_examples/plot_catwoe_transformer
-    auto_examples/plot_woe_transformer
+    ../gyexamples/plot_icustom_converter
+    ../gyexamples/plot_jcustom_syntax
+    ../gyexamples/plot_kcustom_converter_wrapper
+    ../gyexamples/plot_lcustom_options
+    ../gyexamples/plot_mcustom_parser
+    ../gyexamples/plot_mcustom_parser_dataframe
+    ../gyexamples/plot_catwoe_transformer
+    ../gyexamples/plot_woe_transformer
diff --git a/_doc/sphinxdoc/source/tutorial_3_new_operator.rst b/_doc/sphinxdoc/source/tutorial/tutorial_3_new_operator.rst
similarity index 89%
rename from _doc/sphinxdoc/source/tutorial_3_new_operator.rst
rename to _doc/sphinxdoc/source/tutorial/tutorial_3_new_operator.rst
index 0d55f5af..e647b078 100644
--- a/_doc/sphinxdoc/source/tutorial_3_new_operator.rst
+++ b/_doc/sphinxdoc/source/tutorial/tutorial_3_new_operator.rst
@@ -19,5 +19,5 @@ That's the difficult part.
 .. toctree::
     :maxdepth: 1
 
-    auto_examples/plot_pextend_python_runtime
-    auto_examples/plot_qextend_onnxruntime
+    ../gyexamples/plot_pextend_python_runtime
+    ../gyexamples/plot_qextend_onnxruntime
diff --git a/_doc/sphinxdoc/source/tutorial_4_complex.rst b/_doc/sphinxdoc/source/tutorial/tutorial_4_complex.rst
similarity index 60%
rename from _doc/sphinxdoc/source/tutorial_4_complex.rst
rename to _doc/sphinxdoc/source/tutorial/tutorial_4_complex.rst
index 89c411e8..b14d114d 100644
--- a/_doc/sphinxdoc/source/tutorial_4_complex.rst
+++ b/_doc/sphinxdoc/source/tutorial/tutorial_4_complex.rst
@@ -7,5 +7,5 @@ Discrepencies may happen. Let's see some unexpected cases.
 .. toctree::
     :maxdepth: 1
 
-    auto_examples/plot_usparse_xgboost
-    auto_examples/plot_gexternal_lightgbm_reg
+    ../gyexamples/plot_usparse_xgboost
+    ../gyexamples/plot_gexternal_lightgbm_reg
diff --git a/_doc/sphinxdoc/source/tutorial/tutorial_6_training.rst b/_doc/sphinxdoc/source/tutorial/tutorial_6_training.rst
new file mode 100644
index 00000000..b2e6bf36
--- /dev/null
+++ b/_doc/sphinxdoc/source/tutorial/tutorial_6_training.rst
@@ -0,0 +1,10 @@
+
+Training
+========
+
+.. toctree::
+    :maxdepth: 1
+
+    ../gyexamples/plot_orttraining_linear_regression
+    ../gyexamples/plot_orttraining_linear_regression_gpu
+    ../gyexamples/plot_orttraining_mnist
diff --git a/_doc/sphinxdoc/source/tutorial_7_benchmark.rst b/_doc/sphinxdoc/source/tutorial/tutorial_7_benchmark.rst
similarity index 60%
rename from _doc/sphinxdoc/source/tutorial_7_benchmark.rst
rename to _doc/sphinxdoc/source/tutorial/tutorial_7_benchmark.rst
index 190e4e53..ea3c1b73 100644
--- a/_doc/sphinxdoc/source/tutorial_7_benchmark.rst
+++ b/_doc/sphinxdoc/source/tutorial/tutorial_7_benchmark.rst
@@ -5,4 +5,4 @@ Benchmarks
 .. toctree::
     :maxdepth: 1
 
-    auto_examples/plot_benchmark_op
+    ../gyexamples/plot_benchmark_op
diff --git a/_doc/sphinxdoc/source/tutorial_6_training.rst b/_doc/sphinxdoc/source/tutorial_6_training.rst
deleted file mode 100644
index b262d961..00000000
--- a/_doc/sphinxdoc/source/tutorial_6_training.rst
+++ /dev/null
@@ -1,10 +0,0 @@
-
-Training
-========
-
-.. toctree::
-    :maxdepth: 1
-
-    auto_examples/plot_orttraining_linear_regression
-    auto_examples/plot_orttraining_linear_regression_gpu
-    auto_examples/plot_orttraining_mnist
diff --git a/appveyor.yml b/appveyor.yml
index 160c8e35..04e7849f 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -11,6 +11,7 @@ init:
 install:
   - "%PYTHON%\\python -m pip install -r requirements.txt"
   - "%PYTHON%\\python -m pip install -r requirements-dev.txt"
+  - "%PYTHON%\\python -m pip install onnxruntime"
 build: off
 
 before_test:
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index fd2fd803..f01a3356 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -27,6 +27,8 @@ jobs:
       displayName: 'Install Requirements'
     - script: pip install -r requirements-dev.txt
       displayName: 'Install Requirements dev'
+    - script: pip install onnxruntime-training
+      displayName: 'Install onnxruntime-training'
     - script: |
         python -u setup.py build_ext --inplace
       displayName: 'Runs Unit Tests'
@@ -88,6 +90,8 @@ jobs:
       displayName: 'Install Requirements'
     - script: pip install -r requirements-dev.txt
       displayName: 'Install Requirements dev'
+    - script: pip install onnxruntime
+      displayName: 'Install onnxruntime'
     - script: |
         export MACOSX_DEPLOYMENT_TARGET=10.13
         python setup.py build_ext --inplace
diff --git a/onnxcustom/training/optimizers.py b/onnxcustom/training/optimizers.py
index 7970fccb..f34ab910 100644
--- a/onnxcustom/training/optimizers.py
+++ b/onnxcustom/training/optimizers.py
@@ -47,7 +47,7 @@ class OrtGradientOptimizer(BaseEstimator):
     Implements a simple :epkg:`Stochastic Gradient Descent`
     with :epkg:`onnxruntime-training`.
 
-    :param training_onnx: ONNX graph used to train
+    :param model_onnx: ONNX graph used to train
     :param weights_to_train: names of initializers to be optimized
     :param loss_output_name: name of the loss output
     :param max_iter: number of training iterations
diff --git a/onnxcustom/training/orttraining.py b/onnxcustom/training/orttraining.py
index b4a1b4e4..32eb87c2 100644
--- a/onnxcustom/training/orttraining.py
+++ b/onnxcustom/training/orttraining.py
@@ -8,7 +8,7 @@
 from onnx.numpy_helper import to_array
 
 
-def unique_name(existing_names, name, add=True):
+def _unique_name(existing_names, name, add=True):
     """
     Returns a name different from any name in *existing_names*.
 
@@ -65,8 +65,8 @@ def add_loss_output(onx, score_name='squared_error',
             shape.append(d.dim_value if d.dim_value > 0 else None)
 
     if score_name == 'squared_error':
-        diff_name = unique_name(existing_names, "loss_diff")
-        diff2_name = unique_name(existing_names, "loss_diff")
+        diff_name = _unique_name(existing_names, "loss_diff")
+        diff2_name = _unique_name(existing_names, "loss_diff")
         nodes = [make_node('Sub', [output_name, label_name], [diff_name]),
                  make_node('Mul', [diff_name, diff_name], [diff2_name]),
                  make_node('ReduceSum', [diff2_name], [loss_name])]
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 9513f45b..15443dec 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -14,7 +14,6 @@ mlprodict>=0.7
 nbsphinx
 onnxconverter-common
 onnxmltools
-onnxruntime>=1.9.0
 pillow
 py-spy
 pandas