diff --git a/README.rst b/README.rst index d3fd35197..24b612b09 100644 --- a/README.rst +++ b/README.rst @@ -7,8 +7,8 @@ mlprodict ========= -.. image:: https://travis-ci.org/sdpython/mlprodict.svg?branch=master - :target: https://travis-ci.org/sdpython/mlprodict +.. image:: https://travis-ci.com/sdpython/mlprodict.svg?branch=master + :target: https://travis-ci.com/sdpython/mlprodict :alt: Build status .. image:: https://ci.appveyor.com/api/projects/status/g8chk1ufyk1m8uep?svg=true diff --git a/_doc/sphinxdoc/source/conf.py b/_doc/sphinxdoc/source/conf.py index 40a8ae86f..6c179487f 100644 --- a/_doc/sphinxdoc/source/conf.py +++ b/_doc/sphinxdoc/source/conf.py @@ -2,7 +2,7 @@ import sys import os import alabaster -from pyquickhelper.helpgen.default_conf import set_sphinx_variables, get_default_stylesheet +from pyquickhelper.helpgen.default_conf import set_sphinx_variables from sklearn.experimental import enable_hist_gradient_boosting try: from mlprodict.onnx_conv import register_converters, register_rewritten_operators @@ -47,10 +47,7 @@ 'generate_visual_graphs', ]) -html_context = { - 'css_files': get_default_stylesheet([ - '_static/my-styles.css', '_static/gallery.css']), -} +html_css_files = ['my-styles.css'] html_logo = "phdoc_static/project_ico.png" diff --git a/_doc/sphinxdoc/source/index.rst b/_doc/sphinxdoc/source/index.rst index f86ab16c2..2e2a73a51 100644 --- a/_doc/sphinxdoc/source/index.rst +++ b/_doc/sphinxdoc/source/index.rst @@ -13,8 +13,8 @@ mlprodict :ref:`l-README`, :ref:`blog ` -.. image:: https://travis-ci.org/sdpython/mlprodict.svg?branch=master - :target: https://travis-ci.org/sdpython/mlprodict +.. image:: https://travis-ci.com/sdpython/mlprodict.svg?branch=master + :target: https://travis-ci.com/sdpython/mlprodict :alt: Build status .. 
image:: https://ci.appveyor.com/api/projects/status/g8chk1ufyk1m8uep?svg=true diff --git a/_unittests/ut_onnxrt/test_onnxrt_python_runtime_.py b/_unittests/ut_onnxrt/test_onnxrt_python_runtime_.py index ec8abc9cb..2b476e605 100644 --- a/_unittests/ut_onnxrt/test_onnxrt_python_runtime_.py +++ b/_unittests/ut_onnxrt/test_onnxrt_python_runtime_.py @@ -47,7 +47,7 @@ OnnxNeg, OnnxNot, OnnxOr, OnnxPad, OnnxPow, - OnnxQuantizeLinear, + OnnxQLinearConv, OnnxQuantizeLinear, OnnxReciprocal, OnnxReduceL1, OnnxReduceL2, OnnxReduceLogSumExp, OnnxReduceMax, OnnxReduceMean, OnnxReduceMin, @@ -2252,6 +2252,53 @@ def test_onnxt_runtime_pad2(self): def test_onnxt_runtime_pow(self): self.common_test_onnxt_runtime_binary(OnnxPow, numpy.power) + @wraplog() + def test_onnxt_runtime_qlinear_conv(self): + x = numpy.array( + [[255, 174, 162, 25, 203, 168, 58], + [15, 59, 237, 95, 129, 0, 64], + [56, 242, 153, 221, 168, 12, 166], + [232, 178, 186, 195, 237, 162, 237], + [188, 39, 124, 77, 80, 102, 43], + [127, 230, 21, 83, 41, 40, 134], + [255, 154, 92, 141, 42, 148, 247], ], + dtype=numpy.uint8).reshape((1, 1, 7, 7)) + + x_scale = numpy.float32(0.00369204697) + x_zero_point = numpy.uint8(132) + + w = numpy.array([0], dtype=numpy.uint8).reshape((1, 1, 1, 1)) + + w_scale = numpy.array([0.00172794575], dtype=numpy.float32) + w_zero_point = numpy.array([255], dtype=numpy.uint8) + + y_scale = numpy.float32(0.00162681262) + y_zero_point = numpy.uint8(123) + + output = numpy.array( + [[0, 81, 93, 230, 52, 87, 197], + [240, 196, 18, 160, 126, 255, 191], + [199, 13, 102, 34, 87, 243, 89], + [23, 77, 69, 60, 18, 93, 18], + [67, 216, 131, 178, 175, 153, 212], + [128, 25, 234, 172, 214, 215, 121], + [0, 101, 163, 114, 213, 107, 8], ], + dtype=numpy.uint8).reshape((1, 1, 7, 7)) + + node = OnnxQLinearConv('x', 'x_scale', 'x_zero_point', 'w', + 'w_scale', 'w_zero_point', 'y_scale', + 'y_zero_point', output_names=['y'], + op_version=get_opset_number_from_onnx()) + inputs = {'x': x, 'x_scale': x_scale, 'x_zero_point': x_zero_point, + 'w': w, 'w_scale': w_scale, 'w_zero_point': w_zero_point, + 'y_scale': y_scale, 'y_zero_point': y_zero_point} + model_def = node.to_onnx(inputs, + target_opset=get_opset_number_from_onnx()) + oinf = OnnxInference(model_def) + got = oinf.run(inputs)['y'] + self.assertEqualArray(output, got) + python_tested.append(OnnxQLinearConv) + @wraplog() def test_onnxt_runtime_quantize_linear(self): X = numpy.array([[[[-162, 10], [-100, 232], [-20, -50]], @@ -3507,5 +3554,5 @@ def test_make_constant(self): if __name__ == "__main__": - # TestOnnxrtPythonRuntime().test_onnxt_runtime_cast_in() + # TestOnnxrtPythonRuntime().test_onnxt_runtime_qlinear_conv() unittest.main() diff --git a/bin/debug/debug.cpp b/bin/debug/debug.cpp new file mode 100644 index 000000000..4ec7112a6 --- /dev/null +++ b/bin/debug/debug.cpp @@ -0,0 +1,59 @@ +// debug.cpp : Ce fichier contient la fonction 'main'. L'exécution du programme commence et se termine à cet endroit. 
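+// English: this file contains the 'main' function; program execution starts and ends here.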
+// + +#include "op_qlinear_conv_.hpp" + + +void test_qlinear_conv() { + QLinearConv conv; + + const std::string auto_pad = "NOTSET"; + py_array_t dilations; + py_array_t kernel_shape; + py_array_t pads; + py_array_t strides; + + conv.init(auto_pad, dilations, 1, kernel_shape, pads, strides); + + py_array_t x({ + 255, 174, 162, 25, 203, 168, 58, + 15, 59, 237, 95, 129, 0, 64, + 56, 242, 153, 221, 168, 12, 166, + 232, 178, 186, 195, 237, 162, 237, + 188, 39, 124, 77, 80, 102, 43, + 127, 230, 21, 83, 41, 40, 134, + 255, 154, 92, 141, 42, 148, 247 }, + { 1, 1, 7, 7 }); + + float x_scale = (float)0.00369204697; + uint8_t x_zero_point = 132; + + py_array_t w({ 0 }, { 1, 1, 1, 1 }); + + py_array_t w_scale({ (float)0.00172794575 }, {}); + uint8_t w_zero_point = 255; + + float y_scale = (float)0.00162681262; + uint8_t y_zero_point = 123; + + py_array_t B; + + py_array_t output({ + 0, 81, 93, 230, 52, 87, 197, + 240, 196, 18, 160, 126, 255, 191, + 199, 13, 102, 34, 87, 243, 89, + 23, 77, 69, 60, 18, 93, 18, + 67, 216, 131, 178, 175, 153, 212, + 128, 25, 234, 172, 214, 215, 121, + 0, 101, 163, 114, 213, 107, 8 }, + { 1, 1, 7, 7 }); + py_array_t < uint8_t> res = conv.compute( + x, x_scale, x_zero_point, w, w_scale, w_zero_point, y_scale, y_zero_point, B); + if (!res.equal(output)) + throw std::runtime_error("failed"); +} + + +int main() { + test_qlinear_conv(); +} diff --git a/bin/debug/debug.sln b/bin/debug/debug.sln new file mode 100644 index 000000000..e2ffbb37f --- /dev/null +++ b/bin/debug/debug.sln @@ -0,0 +1,31 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 16 +VisualStudioVersion = 16.0.31321.278 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "debug", "debug.vcxproj", "{714F6D8B-15D6-4E4E-940C-CC1AFA599EF1}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Debug|x86 = Debug|x86 + Release|x64 = Release|x64 + Release|x86 = Release|x86 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {714F6D8B-15D6-4E4E-940C-CC1AFA599EF1}.Debug|x64.ActiveCfg = Debug|x64 + {714F6D8B-15D6-4E4E-940C-CC1AFA599EF1}.Debug|x64.Build.0 = Debug|x64 + {714F6D8B-15D6-4E4E-940C-CC1AFA599EF1}.Debug|x86.ActiveCfg = Debug|Win32 + {714F6D8B-15D6-4E4E-940C-CC1AFA599EF1}.Debug|x86.Build.0 = Debug|Win32 + {714F6D8B-15D6-4E4E-940C-CC1AFA599EF1}.Release|x64.ActiveCfg = Release|x64 + {714F6D8B-15D6-4E4E-940C-CC1AFA599EF1}.Release|x64.Build.0 = Release|x64 + {714F6D8B-15D6-4E4E-940C-CC1AFA599EF1}.Release|x86.ActiveCfg = Release|Win32 + {714F6D8B-15D6-4E4E-940C-CC1AFA599EF1}.Release|x86.Build.0 = Release|Win32 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {CC1B1913-0A0E-45AE-827F-C6D97913A40D} + EndGlobalSection +EndGlobal diff --git a/bin/debug/debug.vcxproj b/bin/debug/debug.vcxproj new file mode 100644 index 000000000..aa5de2aad --- /dev/null +++ b/bin/debug/debug.vcxproj @@ -0,0 +1,154 @@ + + + + + Debug + Win32 + + + Release + Win32 + + + Debug + x64 + + + Release + x64 + + + + 16.0 + Win32Proj + {714f6d8b-15d6-4e4e-940c-cc1afa599ef1} + debug + 10.0 + + + + Application + true + v142 + Unicode + + + Application + false + v142 + true + Unicode + + + Application + true + v142 + Unicode + + + Application + false + v142 + true + Unicode + + + + + + + + + + + + + + + + + + + + + true + + + false + + + true + + + false 
+ + + + Level3 + true + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + + + Console + true + + + + + Level3 + true + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + + + Console + true + true + true + + + + + Level3 + true + _DEBUG;_CONSOLE;%(PreprocessorDefinitions);SKIP_PYTHON + true + ../../mlprodict/onnxrt/ops_cpu + + + Console + true + + + + + Level3 + true + true + true + NDEBUG;_CONSOLE;%(PreprocessorDefinitions);SKIP_PYTHON + true + + + Console + true + true + true + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/mlprodict/onnxrt/ops_cpu/_op_list.py b/mlprodict/onnxrt/ops_cpu/_op_list.py index ad6dfa1be..e0dbb8209 100644 --- a/mlprodict/onnxrt/ops_cpu/_op_list.py +++ b/mlprodict/onnxrt/ops_cpu/_op_list.py @@ -78,6 +78,7 @@ from .op_pad import Pad from .op_pow import Pow from .op_quantize_linear import QuantizeLinear +from .op_qlinear_conv import QLinearConv from .op_reciprocal import Reciprocal from .op_reduce_log_sum_exp import ReduceLogSumExp from .op_reduce_l1 import ReduceL1 diff --git a/mlprodict/onnxrt/ops_cpu/op_common_.cpp b/mlprodict/onnxrt/ops_cpu/op_common_.cpp index df53a5207..cc968637c 100644 --- a/mlprodict/onnxrt/ops_cpu/op_common_.cpp +++ b/mlprodict/onnxrt/ops_cpu/op_common_.cpp @@ -7,136 +7,132 @@ #include // realloc -POST_EVAL_TRANSFORM to_POST_EVAL_TRANSFORM(const std::string &value) { - if (value.compare("NONE") == 0) return POST_EVAL_TRANSFORM::NONE; - if (value.compare("LOGISTIC") == 0) return POST_EVAL_TRANSFORM::LOGISTIC; - if (value.compare("SOFTMAX") == 0) return POST_EVAL_TRANSFORM::SOFTMAX; - if (value.compare("SOFTMAX_ZERO") == 0) return POST_EVAL_TRANSFORM::SOFTMAX_ZERO; - if (value.compare("PROBIT") == 0) return POST_EVAL_TRANSFORM::PROBIT; - throw std::runtime_error(std::string("NODE_MODE '") + - value + - std::string("' is not defined.")); +POST_EVAL_TRANSFORM to_POST_EVAL_TRANSFORM(const std::string& value) { + if (value.compare("NONE") == 0) return POST_EVAL_TRANSFORM::NONE; + if (value.compare("LOGISTIC") == 0) return POST_EVAL_TRANSFORM::LOGISTIC; + if (value.compare("SOFTMAX") == 0) return POST_EVAL_TRANSFORM::SOFTMAX; + if (value.compare("SOFTMAX_ZERO") == 0) return POST_EVAL_TRANSFORM::SOFTMAX_ZERO; + if (value.compare("PROBIT") == 0) return POST_EVAL_TRANSFORM::PROBIT; + throw std::runtime_error(std::string("NODE_MODE '") + + value + + std::string("' is not defined.")); } -NODE_MODE to_NODE_MODE(const std::string &value) { - if (value.compare("BRANCH_LEQ") == 0) return NODE_MODE::BRANCH_LEQ; - if (value.compare("BRANCH_LT") == 0) return NODE_MODE::BRANCH_LT; - if (value.compare("BRANCH_GTE") == 0) return NODE_MODE::BRANCH_GTE; - if (value.compare("BRANCH_GT") == 0) return NODE_MODE::BRANCH_GT; - if (value.compare("BRANCH_EQ") == 0) return NODE_MODE::BRANCH_EQ; - if (value.compare("BRANCH_NEQ") == 0) return NODE_MODE::BRANCH_NEQ; - if (value.compare("LEAF") == 0) return NODE_MODE::LEAF; - throw std::runtime_error(std::string("NODE_MODE '") + - value + - std::string("' is not defined.")); +NODE_MODE to_NODE_MODE(const std::string& value) { + if (value.compare("BRANCH_LEQ") == 0) return NODE_MODE::BRANCH_LEQ; + if (value.compare("BRANCH_LT") == 0) return NODE_MODE::BRANCH_LT; + if (value.compare("BRANCH_GTE") == 0) return NODE_MODE::BRANCH_GTE; + if (value.compare("BRANCH_GT") == 0) return NODE_MODE::BRANCH_GT; + if (value.compare("BRANCH_EQ") == 0) return NODE_MODE::BRANCH_EQ; + if (value.compare("BRANCH_NEQ") == 0) return NODE_MODE::BRANCH_NEQ; + if (value.compare("LEAF") == 0) return 
NODE_MODE::LEAF; + throw std::runtime_error(std::string("NODE_MODE '") + + value + + std::string("' is not defined.")); } -const char * to_str(NODE_MODE mode) { - switch(mode) { - case NODE_MODE::BRANCH_LEQ: return "BRANCH_LEQ"; - case NODE_MODE::BRANCH_LT: return "BRANCH_LT"; - case NODE_MODE::BRANCH_GTE: return "BRANCH_GTE"; - case NODE_MODE::BRANCH_GT: return "BRANCH_GT"; - case NODE_MODE::BRANCH_EQ: return "BRANCH_EQ"; - case NODE_MODE::BRANCH_NEQ: return "BRANCH_NEQ"; - case NODE_MODE::LEAF: return "LEAF"; - default: return "?"; - } +const char* to_str(NODE_MODE mode) { + switch (mode) { + case NODE_MODE::BRANCH_LEQ: return "BRANCH_LEQ"; + case NODE_MODE::BRANCH_LT: return "BRANCH_LT"; + case NODE_MODE::BRANCH_GTE: return "BRANCH_GTE"; + case NODE_MODE::BRANCH_GT: return "BRANCH_GT"; + case NODE_MODE::BRANCH_EQ: return "BRANCH_EQ"; + case NODE_MODE::BRANCH_NEQ: return "BRANCH_NEQ"; + case NODE_MODE::LEAF: return "LEAF"; + default: return "?"; + } } -SVM_TYPE to_SVM_TYPE(const std::string &value) { - if (value.compare("SVM_LINEAR") == 0) return SVM_TYPE::SVM_LINEAR; - if (value.compare("SVM_SVC") == 0) return SVM_TYPE::SVM_SVC; - throw std::runtime_error(std::string("SVM_TYPE '") + - value + - std::string("' is not defined.")); +SVM_TYPE to_SVM_TYPE(const std::string& value) { + if (value.compare("SVM_LINEAR") == 0) return SVM_TYPE::SVM_LINEAR; + if (value.compare("SVM_SVC") == 0) return SVM_TYPE::SVM_SVC; + throw std::runtime_error(std::string("SVM_TYPE '") + + value + + std::string("' is not defined.")); } -KERNEL to_KERNEL(const std::string &value) { - if (value.compare("LINEAR") == 0) return KERNEL::LINEAR; - if (value.compare("POLY") == 0) return KERNEL::POLY; - if (value.compare("RBF") == 0) return KERNEL::RBF; - if (value.compare("SIGMOID") == 0) return KERNEL::SIGMOID; - throw std::runtime_error(std::string("KERNEL '") + - value + - std::string("' is not defined.")); +KERNEL to_KERNEL(const std::string& value) { + if (value.compare("LINEAR") == 0) return KERNEL::LINEAR; + if (value.compare("POLY") == 0) return KERNEL::POLY; + if (value.compare("RBF") == 0) return KERNEL::RBF; + if (value.compare("SIGMOID") == 0) return KERNEL::SIGMOID; + throw std::runtime_error(std::string("KERNEL '") + + value + + std::string("' is not defined.")); } AGGREGATE_FUNCTION to_AGGREGATE_FUNCTION(const std::string& input) { - if (input == "AVERAGE") return AGGREGATE_FUNCTION::AVERAGE; - if (input == "SUM") return AGGREGATE_FUNCTION::SUM; - if (input == "MIN") return AGGREGATE_FUNCTION::MIN; - if (input == "MAX") return AGGREGATE_FUNCTION::MAX; - throw std::runtime_error(std::string("AGGREGATE_FUNCTION '") + - input + - std::string("' is not defined.")); + if (input == "AVERAGE") return AGGREGATE_FUNCTION::AVERAGE; + if (input == "SUM") return AGGREGATE_FUNCTION::SUM; + if (input == "MIN") return AGGREGATE_FUNCTION::MIN; + if (input == "MAX") return AGGREGATE_FUNCTION::MAX; + throw std::runtime_error(std::string("AGGREGATE_FUNCTION '") + + input + + std::string("' is not defined.")); } StorageOrder to_StorageOrder(const std::string& input) { - if (input == "UNKNOWN") return StorageOrder::UNKNOWN; - if (input == "NHWC") return StorageOrder::NHWC; - if (input == "NCHW") return StorageOrder::NCHW; - throw std::runtime_error(std::string("StorageOrder '") + - input + - std::string("' is not defined.")); + if (input == "UNKNOWN") return StorageOrder::UNKNOWN; + if (input == "NHWC") return StorageOrder::NHWC; + if (input == "NCHW") return StorageOrder::NCHW; + throw 
std::runtime_error(std::string("StorageOrder '") + + input + + std::string("' is not defined.")); } -AutoPadType to_AutoPadType(const std::string &input) { - if (input == "NOTSET") return AutoPadType::NOTSET; - if (input == "VALID") return AutoPadType::VALID; - if (input == "SAME_UPPER") return AutoPadType::SAME_UPPER; - if (input == "SAME_LOWER") return AutoPadType::SAME_LOWER; - throw std::runtime_error(std::string("AutoPadType '") + - input + - std::string("' is not defined.")); +AutoPadType to_AutoPadType(const std::string& input) { + if (input == "NOTSET") return AutoPadType::NOTSET; + if (input == "VALID") return AutoPadType::VALID; + if (input == "SAME_UPPER") return AutoPadType::SAME_UPPER; + if (input == "SAME_LOWER") return AutoPadType::SAME_LOWER; + throw std::runtime_error(std::string("AutoPadType '") + + input + + std::string("' is not defined.")); } -void debug_print(const std::string &msg, int64_t iter, int64_t end) { - std::cout << msg.c_str() << ":" << iter << "/" << end << "\n"; +void debug_print(const std::string& msg, int64_t iter, int64_t end) { + std::cout << msg.c_str() << ":" << iter << "/" << end << "\n"; } -void debug_print(const std::string &msg, size_t i, size_t j, size_t k, float pa, float pb, float val) { - std::cout << msg.c_str() << ": (" << i << "," << j << "," << k << "): " << pa << "," << pb << " -> " << val << "\n"; +void debug_print(const std::string& msg, size_t i, size_t j, size_t k, float pa, float pb, float val) { + std::cout << msg.c_str() << ": (" << i << "," << j << "," << k << "): " << pa << "," << pb << " -> " << val << "\n"; } -void debug_print(const std::string &msg, size_t i, size_t j, size_t k, double pa, double pb, double val) { - std::cout << msg.c_str() << ": (" << i << "," << j << "," << k << "): " << pa << "," << pb << " -> " << val << "\n"; +void debug_print(const std::string& msg, size_t i, size_t j, size_t k, double pa, double pb, double val) { + std::cout << msg.c_str() << ": (" << i << "," << j << "," << k << "): " << pa << "," << pb << " -> " << val << "\n"; } template void debug_print_(const std::string& msg, T value) { - std::cout << msg.c_str() << ":" << value << "\n"; + std::cout << msg.c_str() << ":" << value << "\n"; } void debug_print(const std::string& msg, float value) { - debug_print_(msg.c_str(), value); + debug_print_(msg.c_str(), value); } void debug_print(const std::string& msg, double value) { - debug_print_(msg.c_str(), value); + debug_print_(msg.c_str(), value); } void debug_print(const std::string& msg, int64_t value) { - debug_print_(msg.c_str(), value); + debug_print_(msg.c_str(), value); } void debug_print(const std::string& msg, size_t value) { - debug_print_(msg.c_str(), value); + debug_print_(msg.c_str(), value); } - - - - diff --git a/mlprodict/onnxrt/ops_cpu/op_common_.hpp b/mlprodict/onnxrt/ops_cpu/op_common_.hpp index 869b0d6f0..972c8c758 100644 --- a/mlprodict/onnxrt/ops_cpu/op_common_.hpp +++ b/mlprodict/onnxrt/ops_cpu/op_common_.hpp @@ -1,5 +1,6 @@ #pragma once +#include #include #include #include @@ -22,10 +23,10 @@ inline bool _isnan_(double x) { return ::isnan(x); } // See https://stackoverflow.com/questions/2249110/how-do-i-make-a-portable-isnan-isinf-function inline bool _isnan_(double x) { - union { uint64_t u; double f; } ieee754; - ieee754.f = x; - return ( (unsigned)(ieee754.u >> 32) & 0x7fffffff ) + - ( (unsigned)ieee754.u != 0 ) > 0x7ff00000; + union { uint64_t u; double f; } ieee754; + ieee754.f = x; + return ((unsigned)(ieee754.u >> 32) & 0x7fffffff) + + ((unsigned)ieee754.u != 0) > 
0x7ff00000; } inline bool _isnan_(float x) { return _isnan_((double)x); } @@ -34,111 +35,111 @@ inline bool _isnan_(float x) { return _isnan_((double)x); } enum class POST_EVAL_TRANSFORM { - NONE, - LOGISTIC, - SOFTMAX, - SOFTMAX_ZERO, - PROBIT + NONE, + LOGISTIC, + SOFTMAX, + SOFTMAX_ZERO, + PROBIT }; -POST_EVAL_TRANSFORM to_POST_EVAL_TRANSFORM(const std::string &value); +POST_EVAL_TRANSFORM to_POST_EVAL_TRANSFORM(const std::string& value); enum class NODE_MODE { - BRANCH_LEQ, - BRANCH_LT, - BRANCH_GTE, - BRANCH_GT, - BRANCH_EQ, - BRANCH_NEQ, - LEAF + BRANCH_LEQ, + BRANCH_LT, + BRANCH_GTE, + BRANCH_GT, + BRANCH_EQ, + BRANCH_NEQ, + LEAF }; -NODE_MODE to_NODE_MODE(const std::string &value); +NODE_MODE to_NODE_MODE(const std::string& value); -const char * to_str(NODE_MODE mode); +const char* to_str(NODE_MODE mode); enum class AGGREGATE_FUNCTION { - AVERAGE, - SUM, - MIN, - MAX + AVERAGE, + SUM, + MIN, + MAX }; AGGREGATE_FUNCTION to_AGGREGATE_FUNCTION(const std::string& input); enum class SVM_TYPE { - SVM_LINEAR, - SVM_SVC + SVM_LINEAR, + SVM_SVC }; -SVM_TYPE to_SVM_TYPE(const std::string &value); +SVM_TYPE to_SVM_TYPE(const std::string& value); enum KERNEL { - LINEAR, - POLY, - RBF, - SIGMOID + LINEAR, + POLY, + RBF, + SIGMOID }; -KERNEL to_KERNEL(const std::string &value); +KERNEL to_KERNEL(const std::string& value); enum StorageOrder { - UNKNOWN = 0, - NHWC = 1, - NCHW = 2, + UNKNOWN = 0, + NHWC = 1, + NCHW = 2, }; -StorageOrder to_StorageOrder(const std::string &value); +StorageOrder to_StorageOrder(const std::string& value); enum class AutoPadType { - NOTSET = 0, - VALID = 1, - SAME_UPPER = 2, - SAME_LOWER = 3, + NOTSET = 0, + VALID = 1, + SAME_UPPER = 2, + SAME_LOWER = 3, }; -AutoPadType to_AutoPadType(const std::string &value); +AutoPadType to_AutoPadType(const std::string& value); static inline float ErfInv(float x) { - float sgn = x < 0 ? -1.0f : 1.0f; - x = (1 - x) * (1 + x); - float log = std::log(x); - float v = 2 / (3.14159f * 0.147f) + 0.5f * log; - float v2 = 1 / (0.147f) * log; - float v3 = -v + std::sqrt(v * v - v2); - x = sgn * std::sqrt(v3); - return x; + float sgn = x < 0 ? -1.0f : 1.0f; + x = (1 - x) * (1 + x); + float log = std::log(x); + float v = 2 / (3.14159f * 0.147f) + 0.5f * log; + float v2 = 1 / (0.147f) * log; + float v3 = -v + std::sqrt(v * v - v2); + x = sgn * std::sqrt(v3); + return x; } static inline double ErfInv(double x) { - double sgn = x < 0 ? -1.0 : 1.0; - x = (1 - x) * (1 + x); - double log = std::log(x); - double v = 2. / (3.14159f * 0.147f) + 0.5f * log; - double v2 = 1. / (0.147f) * log; - double v3 = std::sqrt(v * v - v2) - v; - return sgn * std::sqrt(v3); + double sgn = x < 0 ? -1.0 : 1.0; + x = (1 - x) * (1 + x); + double log = std::log(x); + double v = 2. / (3.14159f * 0.147f) + 0.5f * log; + double v2 = 1. / (0.147f) * log; + double v3 = std::sqrt(v * v - v2) - v; + return sgn * std::sqrt(v3); } static inline float ComputeLogistic(float val) { - float v = 1 / (1 + std::exp(-std::abs(val))); - return (val < 0) ? (1 - v) : v; + float v = 1 / (1 + std::exp(-std::abs(val))); + return (val < 0) ? (1 - v) : v; } static inline double ComputeLogistic(double val) { - double v = 1. / (1. + std::exp(-std::abs(val))); - return (val < 0) ? (1. - v) : v; + double v = 1. / (1. + std::exp(-std::abs(val))); + return (val < 0) ? (1. 
- v) : v; } @@ -147,180 +148,181 @@ static const float ml_sqrt2 = 1.41421356f; template static inline NTYPE ComputeProbit(NTYPE val) { - return ml_sqrt2 * ErfInv(val * 2 - 1); + return ml_sqrt2 * ErfInv(val * 2 - 1); } template static inline NTYPE sigmoid_probability(NTYPE score, NTYPE proba, NTYPE probb) { - NTYPE val = score * proba + probb; - return 1 - ComputeLogistic(val); // ref: https://github.com/arnaudsj/libsvm/blob/eaaefac5ebd32d0e07902e1ae740e038eaaf0826/svm.cpp#L1818 + NTYPE val = score * proba + probb; + return 1 - ComputeLogistic(val); // ref: https://github.com/arnaudsj/libsvm/blob/eaaefac5ebd32d0e07902e1ae740e038eaaf0826/svm.cpp#L1818 } template void ComputeSoftmax(NTYPE* begin, NTYPE* end) { - NTYPE v_max = -std::numeric_limits::max(); - NTYPE* it; - for (it = begin; it != end; ++it) { - if (*it > v_max) - v_max = *it; - } - NTYPE this_sum = (NTYPE)0.; - for (it = begin; it != end; ++it) { - *it = std::exp(*it - v_max); - this_sum += *it; - } - for (it = begin; it != end; ++it) - *it /= this_sum; + NTYPE v_max = -std::numeric_limits::max(); + NTYPE* it; + for (it = begin; it != end; ++it) { + if (*it > v_max) + v_max = *it; + } + NTYPE this_sum = (NTYPE)0.; + for (it = begin; it != end; ++it) { + *it = std::exp(*it - v_max); + this_sum += *it; + } + for (it = begin; it != end; ++it) + *it /= this_sum; } template void ComputeSoftmax(std::vector& values) { - ComputeSoftmax(values.data(), values.data() + values.size()); + ComputeSoftmax(values.data(), values.data() + values.size()); } template void ComputeSoftmaxZero(NTYPE* begin, NTYPE* end) { - NTYPE v_max = -std::numeric_limits::max(); - NTYPE* it; - for (it = begin; it != end; ++it) { - if (*it > v_max) - v_max = *it; - } - NTYPE exp_neg_v_max = std::exp(-v_max); - NTYPE this_sum = (NTYPE)0; - for (it = begin; it != end; ++it) { - if (*it > 0.0000001f || *it < -0.0000001f) { - *it = std::exp(*it - v_max); - this_sum += *it; - } else { - *it *= exp_neg_v_max; - } - } - for (it = begin; it != end; ++it) - *it /= this_sum; + NTYPE v_max = -std::numeric_limits::max(); + NTYPE* it; + for (it = begin; it != end; ++it) { + if (*it > v_max) + v_max = *it; + } + NTYPE exp_neg_v_max = std::exp(-v_max); + NTYPE this_sum = (NTYPE)0; + for (it = begin; it != end; ++it) { + if (*it > 0.0000001f || *it < -0.0000001f) { + *it = std::exp(*it - v_max); + this_sum += *it; + } + else { + *it *= exp_neg_v_max; + } + } + for (it = begin; it != end; ++it) + *it /= this_sum; } template void ComputeSoftmaxZero(std::vector& values) { - ComputeSoftmaxZero(values.data(), values.data() + values.size()); + ComputeSoftmaxZero(values.data(), values.data() + values.size()); } template size_t write_scores(std::vector& scores, POST_EVAL_TRANSFORM post_transform, - NTYPE* Z, int add_second_class) { - if ((scores.size() == 1) && add_second_class) { - scores.push_back(0); - return write_scores(1, scores.data(), post_transform, Z, add_second_class); - } - return write_scores(scores.size(), scores.data(), post_transform, Z, add_second_class); + NTYPE* Z, int add_second_class) { + if ((scores.size() == 1) && add_second_class) { + scores.push_back(0); + return write_scores(1, scores.data(), post_transform, Z, add_second_class); + } + return write_scores(scores.size(), scores.data(), post_transform, Z, add_second_class); } template size_t write_scores(size_t n_classes, NTYPE* scores, POST_EVAL_TRANSFORM post_transform, - NTYPE* Z, int add_second_class) { - if (n_classes >= 2) { - NTYPE * end = scores + n_classes; - switch (post_transform) { - case 
POST_EVAL_TRANSFORM::PROBIT: - for(auto it = scores; it != end; ++it, ++Z) - *Z = ComputeProbit(*it); - break; - case POST_EVAL_TRANSFORM::LOGISTIC: - for(auto it = scores; it != end; ++it, ++Z) - *Z = ComputeLogistic(*it); - break; - case POST_EVAL_TRANSFORM::SOFTMAX: - ComputeSoftmax(scores, end); - memcpy(Z, scores, n_classes * sizeof(NTYPE)); - break; - case POST_EVAL_TRANSFORM::SOFTMAX_ZERO: - ComputeSoftmaxZero(scores, end); - memcpy(Z, scores, n_classes * sizeof(NTYPE)); - break; - default: - case POST_EVAL_TRANSFORM::NONE: - memcpy(Z, scores, n_classes * sizeof(NTYPE)); - break; - } - } - else if (n_classes == 1) { //binary case - if (post_transform == POST_EVAL_TRANSFORM::PROBIT) { - scores[0] = ComputeProbit(scores[0]); - *Z = scores[0]; - } - else { - switch (add_second_class) { - case 0: //0=all positive weights, winning class is positive - scores[1] = scores[0]; - scores[0] = 1.f - scores[0]; //put opposite score in positive slot - *Z = scores[0]; - *(Z+1) = scores[1]; - ++n_classes; - break; - case 1: //1 = all positive weights, winning class is negative - scores[1] = scores[0]; - scores[0] = 1.f - scores[0]; //put opposite score in positive slot - *Z = scores[0]; - *(Z+1) = scores[1]; - ++n_classes; - break; - case 2: - case 3: //2 = mixed weights, winning class is positive - if (post_transform == POST_EVAL_TRANSFORM::LOGISTIC) { - scores[1] = ComputeLogistic(scores[0]); //ml_logit(scores[k]); - scores[0] = ComputeLogistic(-scores[0]); - } - else { - scores[1] = scores[0]; - scores[0] = -scores[0]; - } - *Z = scores[0]; - *(Z+1) = scores[1]; - ++n_classes; - break; - default: - *Z = scores[0]; - break; - } - } - } - return n_classes; + NTYPE* Z, int add_second_class) { + if (n_classes >= 2) { + NTYPE* end = scores + n_classes; + switch (post_transform) { + case POST_EVAL_TRANSFORM::PROBIT: + for (auto it = scores; it != end; ++it, ++Z) + *Z = ComputeProbit(*it); + break; + case POST_EVAL_TRANSFORM::LOGISTIC: + for (auto it = scores; it != end; ++it, ++Z) + *Z = ComputeLogistic(*it); + break; + case POST_EVAL_TRANSFORM::SOFTMAX: + ComputeSoftmax(scores, end); + memcpy(Z, scores, n_classes * sizeof(NTYPE)); + break; + case POST_EVAL_TRANSFORM::SOFTMAX_ZERO: + ComputeSoftmaxZero(scores, end); + memcpy(Z, scores, n_classes * sizeof(NTYPE)); + break; + default: + case POST_EVAL_TRANSFORM::NONE: + memcpy(Z, scores, n_classes * sizeof(NTYPE)); + break; + } + } + else if (n_classes == 1) { //binary case + if (post_transform == POST_EVAL_TRANSFORM::PROBIT) { + scores[0] = ComputeProbit(scores[0]); + *Z = scores[0]; + } + else { + switch (add_second_class) { + case 0: //0=all positive weights, winning class is positive + scores[1] = scores[0]; + scores[0] = 1.f - scores[0]; //put opposite score in positive slot + *Z = scores[0]; + *(Z + 1) = scores[1]; + ++n_classes; + break; + case 1: //1 = all positive weights, winning class is negative + scores[1] = scores[0]; + scores[0] = 1.f - scores[0]; //put opposite score in positive slot + *Z = scores[0]; + *(Z + 1) = scores[1]; + ++n_classes; + break; + case 2: + case 3: //2 = mixed weights, winning class is positive + if (post_transform == POST_EVAL_TRANSFORM::LOGISTIC) { + scores[1] = ComputeLogistic(scores[0]); //ml_logit(scores[k]); + scores[0] = ComputeLogistic(-scores[0]); + } + else { + scores[1] = scores[0]; + scores[0] = -scores[0]; + } + *Z = scores[0]; + *(Z + 1) = scores[1]; + ++n_classes; + break; + default: + *Z = scores[0]; + break; + } + } + } + return n_classes; } template size_t write_scores2(NTYPE* scores, 
POST_EVAL_TRANSFORM post_transform, - NTYPE* Z, int add_second_class) { - switch (post_transform) { - case POST_EVAL_TRANSFORM::PROBIT: - Z[0] = ComputeProbit(scores[0]); - Z[1] = ComputeProbit(scores[1]); - break; - case POST_EVAL_TRANSFORM::LOGISTIC: - Z[0] = ComputeLogistic(scores[0]); - Z[1] = ComputeLogistic(scores[1]); - break; - case POST_EVAL_TRANSFORM::SOFTMAX: - ComputeSoftmax(scores, scores + 2); - memcpy(Z, scores, 2 * sizeof(NTYPE)); - break; - case POST_EVAL_TRANSFORM::SOFTMAX_ZERO: - ComputeSoftmaxZero(scores, scores + 2); - memcpy(Z, scores, 2 * sizeof(NTYPE)); - break; - default: - case POST_EVAL_TRANSFORM::NONE: - memcpy(Z, scores, 2 * sizeof(NTYPE)); - break; - } - return 2; + NTYPE* Z, int add_second_class) { + switch (post_transform) { + case POST_EVAL_TRANSFORM::PROBIT: + Z[0] = ComputeProbit(scores[0]); + Z[1] = ComputeProbit(scores[1]); + break; + case POST_EVAL_TRANSFORM::LOGISTIC: + Z[0] = ComputeLogistic(scores[0]); + Z[1] = ComputeLogistic(scores[1]); + break; + case POST_EVAL_TRANSFORM::SOFTMAX: + ComputeSoftmax(scores, scores + 2); + memcpy(Z, scores, 2 * sizeof(NTYPE)); + break; + case POST_EVAL_TRANSFORM::SOFTMAX_ZERO: + ComputeSoftmaxZero(scores, scores + 2); + memcpy(Z, scores, 2 * sizeof(NTYPE)); + break; + default: + case POST_EVAL_TRANSFORM::NONE: + memcpy(Z, scores, 2 * sizeof(NTYPE)); + break; + } + return 2; } @@ -344,74 +346,74 @@ size_t write_scores2(NTYPE* scores, POST_EVAL_TRANSFORM post_transform, template NTYPE flattened_dimension(const std::vector& values) { - NTYPE r = 1; - for(auto it = values.begin(); it != values.end(); ++it) - r *= *it; - return r; + NTYPE r = 1; + for (auto it = values.begin(); it != values.end(); ++it) + r *= *it; + return r; } template NTYPE flattened_dimension(const std::vector& values, int64_t first) { - NTYPE r = 1; - auto end = values.begin() + first; - for(auto it = values.begin(); it != end; ++it) - r *= *it; - return r; + NTYPE r = 1; + auto end = values.begin() + first; + for (auto it = values.begin(); it != end; ++it) + r *= *it; + return r; } template -void shape2strides(const std::vector& shape, - std::vector& strides, NTYPE cst) { - strides.resize(shape.size()); - strides[strides.size()-1] = sizeof(NTYPE); - for(ssize_t i = strides.size()-2; i >= 0; --i) - strides[i] = strides[i+1] * shape[i+1]; +void shape2strides(const std::vector& shape, + std::vector& strides, NTYPE cst) { + strides.resize(shape.size()); + strides[strides.size() - 1] = sizeof(NTYPE); + for (int64_t i = (int64_t)strides.size() - 2; i >= 0; --i) + strides[i] = strides[i + 1] * shape[i + 1]; } template DIMTYPE SizeFromDimension(const std::vector& shape, size_t start, size_t end) { - DIMTYPE size = 1; - for (size_t i = start; i < end; i++) { - if (shape[i] < 0) - return -1; - size *= shape[i]; - } - return size; + DIMTYPE size = 1; + for (size_t i = start; i < end; i++) { + if (shape[i] < 0) + return -1; + size *= shape[i]; + } + return size; } template constexpr T roundUpPow2(T a) { - return (a + (b - 1)) & (~(b - 1)); + return (a + (b - 1)) & (~(b - 1)); } inline int64_t HandleNegativeAxis(int64_t axis, int64_t tensor_rank) { - return axis < 0 ? axis + tensor_rank : axis; + return axis < 0 ? axis + tensor_rank : axis; } template void debug_print(const std::string& msg, size_t size, const T* value) { - std::cout << msg << " - size:" << size << " :: "; - size_t i = size > 10 ? 
10 : size; - for (size_t j = 0; j < i; ++j) - std::cout << value[j] << " "; - std::cout << "\n"; + std::cout << msg << " - size:" << size << " :: "; + size_t i = size > 10 ? 10 : size; + for (size_t j = 0; j < i; ++j) + std::cout << value[j] << " "; + std::cout << "\n"; } template void debug_print(const std::string& msg, const std::vector& value) { - auto size = value.size(); - std::cout << msg << " - size:" << size << " :: "; - size_t i = size > 10 ? 10 : size; - for (size_t j = 0; j < i; ++j) - std::cout << value[j] << " "; - std::cout << "\n"; + auto size = value.size(); + std::cout << msg << " - size:" << size << " :: "; + size_t i = size > 10 ? 10 : size; + for (size_t j = 0; j < i; ++j) + std::cout << value[j] << " "; + std::cout << "\n"; } @@ -426,18 +428,18 @@ void debug_print(const std::string& msg, size_t i, size_t j, size_t k, double pa template inline void MakeStringInternal(std::ostringstream& ss, const T& t) noexcept { - ss << t; + ss << t; } template inline void MakeStringInternal(std::ostringstream& ss, const T& t, const Args&... args) noexcept { - MakeStringInternal(ss, t); - MakeStringInternal(ss, args...); + MakeStringInternal(ss, t); + MakeStringInternal(ss, args...); } template inline std::string MakeString(const Args&... args) { - std::ostringstream ss; - MakeStringInternal(ss, args...); - return std::string(ss.str()); + std::ostringstream ss; + MakeStringInternal(ss, args...); + return std::string(ss.str()); } diff --git a/mlprodict/onnxrt/ops_cpu/op_conv_.cpp b/mlprodict/onnxrt/ops_cpu/op_conv_.cpp index 170c235d1..e68a455bb 100644 --- a/mlprodict/onnxrt/ops_cpu/op_conv_.cpp +++ b/mlprodict/onnxrt/ops_cpu/op_conv_.cpp @@ -23,35 +23,17 @@ namespace py = pybind11; template -class Conv : ConvPoolCommon { - - private: - - AutoPadType auto_pad_; - std::vector dilations_; - int64_t group_; - std::vector kernel_shape_; - std::vector pads_; - std::vector strides_; +class Conv : public ConvPoolCommon { public: Conv(); - void init(const std::string &auto_pad, - py::array_t dilations, - int64_t group, - py::array_t kernel_shape, - py::array_t pads, - py::array_t strides); py::array_t compute(py::array_t X, py::array_t W, py::array_t B) const; - private: - - void compute_kernel_shape(const std::vector& weight_shape, - std::vector& kernel_shape) const; + protected: void compute_gil_free(py::array_t X, py::array_t W, py::array_t B, py::array_t& Y, @@ -64,57 +46,10 @@ class Conv : ConvPoolCommon { const std::vector& x_dims, const std::vector& y_dims, const std::vector& w_dims) const; - - void infer_output_shape(const std::vector& input_shape, - const std::vector& kernel_shape, - const std::vector& strides_p, - const std::vector& dilations_p, - std::vector& pads_p, - std::vector& output_shape, - bool ForceSymmetricAutoPadding = false) const; }; template -Conv::Conv() : ConvPoolCommon() { -} - - -template -void Conv::init( - const std::string &auto_pad, - py::array_t dilations, - int64_t group, - py::array_t kernel_shape, - py::array_t pads, - py::array_t strides - ) { - auto_pad_ = to_AutoPadType(auto_pad); - array2vector(dilations_, dilations, int64_t); - group_ = group; - array2vector(dilations_, dilations, int64_t); - array2vector(pads_, pads, int64_t); - array2vector(strides_, strides, int64_t); -} - - -template -void Conv::compute_kernel_shape(const std::vector& weight_shape, - std::vector& kernel_shape) const { - if (kernel_shape_.size() > 0) { - kernel_shape = kernel_shape_; - if (kernel_shape.size() + 2 != weight_shape.size()) - throw std::runtime_error( - "kernel_shape 
num_dims is not compatible with W num_dims (1)."); - - for (size_t i = 0; i < kernel_shape.size(); ++i) - if (kernel_shape[i] != weight_shape[i + 2]) - throw std::runtime_error( - "kernel_shape num_dims is not compatible with W num_dims (2)."); - } - else { - auto& weight_dims = weight_shape; - kernel_shape = std::vector(weight_dims.begin() + 2, weight_dims.end()); - } +Conv::Conv() : ConvPoolCommon() { } @@ -150,7 +85,7 @@ py::array_t Conv::compute(py::array_t y_dims; y_dims.insert(y_dims.begin(), {N, M}); std::vector input_shape(x_dims.begin() + 2, x_dims.end()); - infer_output_shape(input_shape, kernel_shape, strides, dilations, pads, y_dims); + infer_output_shape(input_shape, kernel_shape, strides, dilations, pads, y_dims, false); std::vector output_shape(y_dims.begin() + 2, y_dims.end()); // py::array::ShapeContainer shape(y_dims); @@ -167,38 +102,6 @@ py::array_t Conv::compute(py::array_t -void Conv::infer_output_shape( - const std::vector& input_shape, - const std::vector& kernel_shape, - const std::vector& strides_p, - const std::vector& dilations_p, - std::vector& pads_p, - std::vector& output_shape, - bool ForceSymmetricAutoPadding) const { - - size_t rank = input_shape.size(); - int64_t dim_size; - - for (size_t dim = 0; dim < rank; ++dim) { - if (dim >= strides_p.size() || dim >= kernel_shape.size() || - dim >= dilations_p.size() || dim >= pads_p.size() || - rank + dim >= pads_p.size()) - throw std::runtime_error("Failure in infer_output_shape."); - - dim_size = 0; - ComputePadAndOutputShape( - input_shape[dim], strides_p[dim], kernel_shape[dim], - dilations_p[dim], auto_pad_, &pads_p.at(dim), - &pads_p.at(input_shape.size() + dim), - &dim_size, ForceSymmetricAutoPadding); - if (dim_size <= 0) - throw std::runtime_error("Invalid argument in infer_output_shape."); - output_shape.push_back(dim_size); - } -} - - template void Conv::compute_gil_free( py::array_t X, py::array_t W, py::array_t B, py::array_t& Y, diff --git a/mlprodict/onnxrt/ops_cpu/op_conv_matrices_.cpp b/mlprodict/onnxrt/ops_cpu/op_conv_matrices_.cpp new file mode 100644 index 000000000..a21168c94 --- /dev/null +++ b/mlprodict/onnxrt/ops_cpu/op_conv_matrices_.cpp @@ -0,0 +1,120 @@ +#include "op_conv_matrices_.hpp" + + +void ComputePadAndOutputShape( + int64_t in_dim, int64_t stride, + int64_t kernel, int64_t dilation, + AutoPadType pad_type, int64_t* pad_head, + int64_t* pad_tail, int64_t* out_dim, + bool ForceSymmetricAutoPadding) { + + const int64_t dkernel = dilation * (kernel - 1) + 1; + + if (pad_type == AutoPadType::NOTSET) { + *out_dim = static_cast(static_cast( + in_dim + *pad_head + *pad_tail - dkernel) / stride + 1); + } + else { + switch (pad_type) { + case AutoPadType::VALID: + *pad_head = 0; + *pad_tail = 0; + *out_dim = (in_dim - dkernel) / stride + 1; + break; + case AutoPadType::SAME_UPPER: + case AutoPadType::SAME_LOWER: { + if (dilation != 1) + throw std::runtime_error( + "Dilation not supported for AutoPadType::SAME_UPPER or AutoPadType::SAME_LOWER."); + int64_t legacy_target_size = (in_dim + stride - 1) / stride; + int64_t pad_needed = (legacy_target_size - 1) * stride + kernel - in_dim; + *out_dim = (in_dim + pad_needed - dkernel) / stride + 1; + + // make sure padding is symmetric + if (ForceSymmetricAutoPadding) + pad_needed = roundUpPow2(pad_needed); + + *pad_head = (pad_type == AutoPadType::SAME_LOWER) + ? 
(pad_needed + 1) / 2 + : pad_needed / 2; + *pad_tail = pad_needed - *pad_head; + } break; + default: + throw std::runtime_error("Invalid argument in ComputePadAndOutputShape."); + } + } +} + + +void ConvPoolCommonShape::init( + const std::string& auto_pad, + py_array_t kernel_shape) { + auto_pad_ = to_AutoPadType(auto_pad); + array2vector(kernel_shape_, kernel_shape, int64_t); +} + + +void ConvPoolCommonShape::compute_kernel_shape( + const std::vector& weight_shape, + std::vector& kernel_shape) const { + if (kernel_shape_.size() > 0) { + kernel_shape = kernel_shape_; + if (kernel_shape.size() + 2 != weight_shape.size()) + throw std::runtime_error( + "kernel_shape num_dims is not compatible with W num_dims (1)."); + + for (size_t i = 0; i < kernel_shape.size(); ++i) + if (kernel_shape[i] != weight_shape[i + 2]) + throw std::runtime_error( + "kernel_shape num_dims is not compatible with W num_dims (2)."); + } + else { + auto& weight_dims = weight_shape; + kernel_shape = std::vector(weight_dims.begin() + 2, weight_dims.end()); + } +} + + +void ConvPoolCommonShape::infer_output_shape(const std::vector& input_shape, + const std::vector& kernel_shape, + const std::vector& strides_p, + const std::vector& dilations_p, + std::vector& pads_p, + std::vector& output_shape, + bool ForceSymmetricAutoPadding) const { + + size_t rank = input_shape.size(); + int64_t dim_size; + + for (size_t dim = 0; dim < rank; ++dim) { + if (dim >= strides_p.size() || dim >= kernel_shape.size() || + dim >= dilations_p.size() || dim >= pads_p.size() || + rank + dim >= pads_p.size()) + throw std::runtime_error("Failure in infer_output_shape."); + + dim_size = 0; + ComputePadAndOutputShape( + input_shape[dim], strides_p[dim], kernel_shape[dim], + dilations_p[dim], auto_pad_, &pads_p.at(dim), + &pads_p.at(input_shape.size() + dim), + &dim_size, ForceSymmetricAutoPadding); + if (dim_size <= 0) + throw std::runtime_error("Invalid argument in infer_output_shape."); + output_shape.push_back(dim_size); + } +} + + +void ConvPoolCommon::init( + const std::string& auto_pad, + py_array_t dilations, + int64_t group, + py_array_t kernel_shape, + py_array_t pads, + py_array_t strides) { + ConvPoolCommonShape::init(auto_pad, kernel_shape); + array2vector(dilations_, dilations, int64_t); + group_ = group; + array2vector(pads_, pads, int64_t); + array2vector(strides_, strides, int64_t); +} diff --git a/mlprodict/onnxrt/ops_cpu/op_conv_matrices_.hpp b/mlprodict/onnxrt/ops_cpu/op_conv_matrices_.hpp index 8afbb8cd7..92d1bb935 100644 --- a/mlprodict/onnxrt/ops_cpu/op_conv_matrices_.hpp +++ b/mlprodict/onnxrt/ops_cpu/op_conv_matrices_.hpp @@ -7,372 +7,802 @@ #define _CRT_SECURE_NO_WARNINGS #endif +#include "op_common_.hpp" +#define is_a_ge_zero_and_a_lt_b(a, b) (static_cast(a) < static_cast(b)) + + #ifndef SKIP_PYTHON +//#include +#include +#include +#include +//#include + #if USE_OPENMP #include #endif +namespace py = pybind11; + +#define py_array_t py::array_t +#define py_array_style py::array::c_style | py::array::forcecast +#define py_gil_scoped_release py::gil_scoped_release + +#else + +#include +#include +#include +#include +#include // cout +#include +#include + +#define py_array_style int +#define py_gil_scoped_release int + +template +bool cmp_vector(const std::vector& v1, const std::vector& v2) { + if (v1.size() != v2.size()) + return false; + for (size_t i = 0; i < v1.size(); ++i) + if (v1[i] != v2[i]) + return false; + return true; +} + +template +class py_array_t : public std::vector { + std::vector shape_; +public: + py_array_t() 
: std::vector(), shape_() {} + py_array_t(const std::vector& shape) : std::vector(), shape_(shape) { + int64_t s = 1; + for (auto it : shape) + s *= it; + this->resize(s); + } + py_array_t(const std::vector& values, const std::vector& shape) : + std::vector(values), shape_(shape) { + if (shape.empty() && !!values.empty()) { + shape_.resize(1); + shape_[0] = values.size(); + } + } + const T* data(int64_t p = 0) const { return &((*this)[p]); } + int64_t ndim() const { return shape_.size(); } + int64_t shape(size_t i) const { return shape_[i]; } + bool equal(const py_array_t& value) { + return cmp_vector(shape_, value.shape_) && cmp_vector(*this, value); + } + bool operator == (T* ptr) { + if (ptr == nullptr) + return this->empty(); + throw std::runtime_error("not implemented when ptr != nullptr"); + } +}; + #endif -#include "op_common_.hpp" +template +void TensorTranspose(const T* input, T* output, size_t M, size_t N) { + const T* ptr; + for (size_t i = 0; i < M; ++i) { + ptr = input + i * N; + for (size_t j = 0; j < N; ++j) + output[j * M + i] = ptr[j]; + } +} -#define is_a_ge_zero_and_a_lt_b(a, b) (static_cast(a) < static_cast(b)) + +template +void QConvDepthwise(const T** Input, TI InputZeroPoint, const T* Filter, + TI FilterZeroPoint, bool FilterIsSigned, TI* Output, + size_t Channels, size_t OutputCount, size_t KernelSize) { + // Signed version. + while (OutputCount > 0) { + + size_t ChannelOffset = 0; + size_t c = Channels; + + while (c > 0) { + int32_t Accumulator = 0; + size_t ChannelKernelOffset = ChannelOffset; + + for (size_t k = 0; k < KernelSize; ++k) { + int32_t InputValue = int32_t(Input[k][ChannelOffset]) - InputZeroPoint; + int32_t FilterValue = int32_t(Filter[ChannelKernelOffset]) - FilterZeroPoint; + Accumulator += InputValue * FilterValue; + ChannelKernelOffset += Channels; + } + + *Output++ = Accumulator; + ++ChannelOffset; + --c; + } + + Input += KernelSize; + --OutputCount; + } +} + + +// The function adds value to C, assuming this array +// was initialized. +template +void gemm(bool transA, bool transB, + size_t M, size_t N, size_t K, NTYPE alpha, + const NTYPE* A, const NTYPE* B, NTYPE beta, + NTYPE* C) { + + if (transA) { + if (transB) { + } + else { + // a A B + b C, dimension = M * N + NTYPE* begin; + NTYPE val; + NTYPE val0; + size_t i, j, k, maxc = 0; + const NTYPE* pA, * pB; + for (i = 0, begin = C; i < M; ++i) { + for (j = 0; j < N; ++j, ++begin) { + val0 = *begin * beta; + val = 0; + pA = A + i; + pB = B + j; + for (k = K; k > 0; --k, pA += K, pB += N) + val += *pA * *pB; + *begin = val0 + val * alpha; + maxc = maxc > (size_t)(begin - C) ? maxc : (size_t)(begin - C); + if (maxc > M * N) + throw std::runtime_error("gemm10: maxc > M * N"); + } + } + return; + } + } + else { + if (transB) { + } + else { + // a A B + b C, dimension = M * N + NTYPE* begin; + NTYPE val; + NTYPE val0; + size_t i, j, k, maxc = 0; + const NTYPE* pA, * pB; + for (i = 0, begin = C; i < M; ++i) { + for (j = 0; j < N; ++j, ++begin) { + val0 = *begin * beta; + val = 0; + pA = A + i * K; + pB = B + j; + for (k = K; k > 0; --k, ++pA, pB += N) + val += *pA * *pB; + *begin = val0 + val * alpha; + maxc = maxc > (size_t)(begin - C) ? 
maxc : (size_t)(begin - C); + if (maxc > M * N) + throw std::runtime_error("gemm00: maxc > M * N"); + } + } + return; + } + } + throw std::runtime_error("Not implemented for transposed matrices (Gemm)."); +} + + +// NTYPE is uint8_t or int8_t +template +void QGemm(bool transA, bool transB, + size_t M, size_t N, size_t K, NTYPE alpha, + const NTYPE* A, const NTYPE* B, NTYPE beta, + int32_t* C, size_t lda, size_t ldb, size_t ldc, + NTYPE ZeroPointA = 0, + const NTYPE* ZeroPointB = nullptr, + bool BIsPacked = false, + bool PerColumnZeroPoints = false) { + + /* + struct GemmShapeParams { + size_t M = 0; + size_t N = 0; + size_t K = 0; + bool BIsSigned = false}; + struct GemmDataParams { + const uint8_t* A = nullptr; + size_t lda = 0; + uint8_t ZeroPointA = 0; + const void* B = 0; + size_t ldb = 0; + const uint8_t* ZeroPointB = nullptr; + bool BIsPacked = false; + bool PerColumnZeroPoints = false; + int32_t* C = nullptr; + size_t ldc = 0}; + */ + if (alpha != 1) + throw std::runtime_error("Not implemented for alpha != 1 (QGemm)."); + if (beta != 1) + throw std::runtime_error("Not implemented for beta != 1 (QGemm)."); + if (transA) { + if (transB) { + } + else { + } + } + else { + if (transB) { + } + else { + // a A B + b C, dimension = M * N + int32_t* begin; + NTYPE val; + NTYPE val0; + size_t i, j, k, maxc = 0; + const NTYPE* pA, * pB; + for (i = 0, begin = C; i < M; ++i) { + for (j = 0; j < N; ++j, ++begin) { + val0 = *begin; /* * beta;*/ + val = 0; + pA = A + i * K; + pB = B + j; + for (k = K; k > 0; --k, ++pA, pB += N) + val += *pA * *pB; + *begin = val0 + val; /* * alpha;*/ + maxc = maxc > (size_t)(begin - C) ? maxc : (size_t)(begin - C); + if (maxc > M * N) + throw std::runtime_error("qgemm00: maxc > M * N"); + } + } + return; + } + } + throw std::runtime_error("Not implemented for transposed matrices (QGemm)."); +} template static void Im2colWithEqualPadding( - int64_t output_h, int64_t output_w, const T* data_im, int64_t channels, - int64_t height, int64_t width, int64_t kernel_h, int64_t kernel_w, - int64_t dilation_h, int64_t dilation_w, int64_t pad_t, int64_t pad_l, - int64_t stride_h, int64_t stride_w, T* data_col, T padding_value) { - // From Intel, https://github.com/BVLC/caffe/pull/3536 - int64_t pad_h = pad_t; - int64_t pad_w = pad_l; - int64_t channel_size = height * width; - for (int64_t channel = channels; channel--; data_im += channel_size) { - for (int64_t kernel_row = 0; kernel_row < kernel_h; kernel_row++) { - for (int64_t kernel_col = 0; kernel_col < kernel_w; kernel_col++) { - int64_t input_row = -pad_h + kernel_row * dilation_h; - for (int64_t output_rows = output_h; output_rows; output_rows--) { - if (!is_a_ge_zero_and_a_lt_b(input_row, height)) { - std::fill_n(data_col, output_w, padding_value); - data_col += output_w; - } - else { - int64_t input_col = -pad_w + kernel_col * dilation_w; - const T* rdptr = data_im + input_row * width + input_col; - for (int64_t i = 0; i != output_w; ++i) { - *data_col = is_a_ge_zero_and_a_lt_b(input_col, width) - ? 
rdptr[i * stride_w] - : padding_value; - input_col += stride_w; - ++data_col; - } - } - input_row += stride_h; - } - } - } - } + int64_t output_h, int64_t output_w, const T* data_im, int64_t channels, + int64_t height, int64_t width, int64_t kernel_h, int64_t kernel_w, + int64_t dilation_h, int64_t dilation_w, int64_t pad_t, int64_t pad_l, + int64_t stride_h, int64_t stride_w, T* data_col, T padding_value) { + // From Intel, https://github.com/BVLC/caffe/pull/3536 + int64_t pad_h = pad_t; + int64_t pad_w = pad_l; + int64_t channel_size = height * width; + for (int64_t channel = channels; channel--; data_im += channel_size) { + for (int64_t kernel_row = 0; kernel_row < kernel_h; kernel_row++) { + for (int64_t kernel_col = 0; kernel_col < kernel_w; kernel_col++) { + int64_t input_row = -pad_h + kernel_row * dilation_h; + for (int64_t output_rows = output_h; output_rows; output_rows--) { + if (!is_a_ge_zero_and_a_lt_b(input_row, height)) { + std::fill_n(data_col, output_w, padding_value); + data_col += output_w; + } + else { + int64_t input_col = -pad_w + kernel_col * dilation_w; + const T* rdptr = data_im + input_row * width + input_col; + for (int64_t i = 0; i != output_w; ++i) { + *data_col = is_a_ge_zero_and_a_lt_b(input_col, width) + ? rdptr[i * stride_w] + : padding_value; + input_col += stride_w; + ++data_col; + } + } + input_row += stride_h; + } + } + } + } } template void Im2colNd_NCHW( - const T* data_img, const int64_t* im_shape, - const int64_t* col_shape, int64_t /*img_size*/, - int64_t /*col_size*/, const int64_t* kernel_shape, - const int64_t* stride, const int64_t* dilation, - const int64_t* pad, int64_t N, T* data_col, - bool accumulate_output = false, - T padding_value = 0) { - int64_t kernel_size = 1; - for (int64_t i = 0; i < N; ++i) - kernel_size *= kernel_shape[i]; - - int64_t channels_col = col_shape[0]; - std::vector d_offset(N, 0); - std::vector d_iter(N, 0); - - for (int64_t c_col = 0; c_col < channels_col; ++c_col) { - // Loop over spatial axes in reverse order to compute a per-axis offset. - int64_t offset = c_col; - for (int64_t d_i = N - 1; d_i >= 0; --d_i) { - if (d_i < N - 1) - offset /= kernel_shape[d_i + 1]; - d_offset[d_i] = offset % kernel_shape[d_i]; - } - for (bool incremented = true; incremented;) { - // Loop over spatial axes in forward order to compute the indices in the - // image and column, and whether the index lies in the padding. - int64_t index_col = c_col; - int64_t index_im = c_col / kernel_size; - bool is_padding = false; - for (int64_t d_i = 0; d_i < N; ++d_i) { - int64_t d = d_iter[d_i]; - int64_t d_im = d * stride[d_i] - pad[d_i] + d_offset[d_i] * dilation[d_i]; - is_padding |= d_im < 0 || d_im >= im_shape[d_i + 1]; - index_col *= col_shape[d_i + 1]; - index_col += d; - index_im *= im_shape[d_i + 1]; - index_im += d_im; - } - if (!accumulate_output) { - if (is_padding) - data_col[index_col] = padding_value; - else - data_col[index_col] = data_img[index_im]; - } - else if (!is_padding) // col2im - data_col[index_im] += data_img[index_col]; - - // Loop over spatial axes in reverse order to choose an index, - // like counting. 
- incremented = false; - for (int64_t d_i = N - 1; d_i >= 0; --d_i) { - int64_t d_max = col_shape[d_i + 1]; - // ORT_ENFORCE(d_iter[d_i] < d_max); - if (d_iter[d_i] == d_max - 1) - d_iter[d_i] = 0; - else { // d_iter[d_i] < d_max - 1 - ++d_iter[d_i]; - incremented = true; - break; - } - } - } // while(incremented) { - } // for (int c = 0; c < channels_col; ++c) { + const T* data_img, const int64_t* im_shape, + const int64_t* col_shape, int64_t /*img_size*/, + int64_t /*col_size*/, const int64_t* kernel_shape, + const int64_t* stride, const int64_t* dilation, + const int64_t* pad, int64_t N, T* data_col, + bool accumulate_output = false, + T padding_value = 0) { + int64_t kernel_size = 1; + for (int64_t i = 0; i < N; ++i) + kernel_size *= kernel_shape[i]; + + int64_t channels_col = col_shape[0]; + std::vector d_offset(N, 0); + std::vector d_iter(N, 0); + + for (int64_t c_col = 0; c_col < channels_col; ++c_col) { + // Loop over spatial axes in reverse order to compute a per-axis offset. + int64_t offset = c_col; + for (int64_t d_i = N - 1; d_i >= 0; --d_i) { + if (d_i < N - 1) + offset /= kernel_shape[d_i + 1]; + d_offset[d_i] = offset % kernel_shape[d_i]; + } + for (bool incremented = true; incremented;) { + // Loop over spatial axes in forward order to compute the indices in the + // image and column, and whether the index lies in the padding. + int64_t index_col = c_col; + int64_t index_im = c_col / kernel_size; + bool is_padding = false; + for (int64_t d_i = 0; d_i < N; ++d_i) { + int64_t d = d_iter[d_i]; + int64_t d_im = d * stride[d_i] - pad[d_i] + d_offset[d_i] * dilation[d_i]; + is_padding |= d_im < 0 || d_im >= im_shape[d_i + 1]; + index_col *= col_shape[d_i + 1]; + index_col += d; + index_im *= im_shape[d_i + 1]; + index_im += d_im; + } + if (!accumulate_output) { + if (is_padding) + data_col[index_col] = padding_value; + else + data_col[index_col] = data_img[index_im]; + } + else if (!is_padding) // col2im + data_col[index_im] += data_img[index_col]; + + // Loop over spatial axes in reverse order to choose an index, + // like counting. 
+ incremented = false; + for (int64_t d_i = N - 1; d_i >= 0; --d_i) { + int64_t d_max = col_shape[d_i + 1]; + // ORT_ENFORCE(d_iter[d_i] < d_max); + if (d_iter[d_i] == d_max - 1) + d_iter[d_i] = 0; + else { // d_iter[d_i] < d_max - 1 + ++d_iter[d_i]; + incremented = true; + break; + } + } + } // while(incremented) { + } // for (int c = 0; c < channels_col; ++c) { } template void Im2col_NCHW( - const T* data_im, int64_t channels, - int64_t height, int64_t width, - int64_t kernel_h, int64_t kernel_w, - int64_t dilation_h, int64_t dilation_w, - int64_t pad_t, int64_t pad_l, int64_t pad_b, int64_t pad_r, - int64_t stride_h, int64_t stride_w, T* data_col, - T padding_value = 0) { - const int64_t output_h = - (height + pad_b + pad_t - (dilation_h * (kernel_h - 1) + 1)) - / stride_h + 1; - const int64_t output_w = - (width + pad_l + pad_r - (dilation_w * (kernel_w - 1) + 1)) - / stride_w + 1; - - // Fast path for zero padding and no dilation - // From Torch, THNN_(unfolded_copy) - if (dilation_h == 1 && dilation_w == 1 && pad_l == 0 && pad_r == 0 && - pad_t == 0 && pad_b == 0) { - for (auto k = 0; k < channels * kernel_h * kernel_w; k++) { - const auto nip = k / (kernel_h * kernel_w); - const auto rest = k % (kernel_h * kernel_w); - const auto kh = rest / kernel_w; - const auto kw = rest % kernel_w; - auto* dst = data_col + nip * (kernel_h * kernel_w * output_h * output_w) + - kh * (kernel_w * output_h * output_w) + kw * (output_h * output_w); - const auto* src = data_im + nip * (height * width); - for (auto y = 0; y < output_h; y++) { - const auto iy = y * stride_h + kh; - const auto ix = kw; - if (stride_w == 1) { - memcpy( - dst + (y * output_w), - src + (iy * width + ix), - sizeof(T) * output_w); - } - else { - for (auto x = 0; x < output_w; x++) { - memcpy( - dst + (y * output_w + x), - src + (iy * width + ix + x * stride_w), - sizeof(T)); - } - } - } - } - return; - } - - // Fast path for equal padding - if (pad_l == pad_r && pad_t == pad_b) { - Im2colWithEqualPadding( - output_h, output_w, data_im, channels, height, width, - kernel_h, kernel_w, dilation_h, dilation_w, pad_t, pad_l, - stride_h, stride_w, data_col, padding_value); - return; - } - - // Baseline - const int64_t dkernel_h = dilation_h * (kernel_h - 1) + 1; - const int64_t dkernel_w = dilation_w * (kernel_w - 1) + 1; - - int64_t height_col = (height + pad_t + pad_b - dkernel_h) / stride_h + 1; - int64_t width_col = (width + pad_l + pad_r - dkernel_w) / stride_w + 1; - - int64_t channels_col = channels * kernel_h * kernel_w; - for (int64_t c = 0; c < channels_col; ++c) { - int64_t w_offset = c % kernel_w; - int64_t h_offset = (c / kernel_w) % kernel_h; - int64_t c_im = c / kernel_h / kernel_w; - for (int64_t h = 0; h < height_col; ++h) { - for (int64_t w = 0; w < width_col; ++w) { - int64_t h_pad = h * stride_h - pad_t + h_offset * dilation_h; - int64_t w_pad = w * stride_w - pad_l + w_offset * dilation_w; - if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width) - data_col[(c * height_col + h) * width_col + w] = - data_im[(c_im * height + h_pad) * width + w_pad]; - else - data_col[(c * height_col + h) * width_col + w] = padding_value; - } - } - } + const T* data_im, int64_t channels, + int64_t height, int64_t width, + int64_t kernel_h, int64_t kernel_w, + int64_t dilation_h, int64_t dilation_w, + int64_t pad_t, int64_t pad_l, int64_t pad_b, int64_t pad_r, + int64_t stride_h, int64_t stride_w, T* data_col, + T padding_value = 0) { + const int64_t output_h = + (height + pad_b + pad_t - (dilation_h * (kernel_h - 1) + 1)) + 
/ stride_h + 1; + const int64_t output_w = + (width + pad_l + pad_r - (dilation_w * (kernel_w - 1) + 1)) + / stride_w + 1; + + // Fast path for zero padding and no dilation + // From Torch, THNN_(unfolded_copy) + if (dilation_h == 1 && dilation_w == 1 && pad_l == 0 && pad_r == 0 && + pad_t == 0 && pad_b == 0) { + for (auto k = 0; k < channels * kernel_h * kernel_w; k++) { + const auto nip = k / (kernel_h * kernel_w); + const auto rest = k % (kernel_h * kernel_w); + const auto kh = rest / kernel_w; + const auto kw = rest % kernel_w; + auto* dst = data_col + nip * (kernel_h * kernel_w * output_h * output_w) + + kh * (kernel_w * output_h * output_w) + kw * (output_h * output_w); + const auto* src = data_im + nip * (height * width); + for (auto y = 0; y < output_h; y++) { + const auto iy = y * stride_h + kh; + const auto ix = kw; + if (stride_w == 1) { + memcpy( + dst + (y * output_w), + src + (iy * width + ix), + sizeof(T) * output_w); + } + else { + for (auto x = 0; x < output_w; x++) { + memcpy( + dst + (y * output_w + x), + src + (iy * width + ix + x * stride_w), + sizeof(T)); + } + } + } + } + return; + } + + // Fast path for equal padding + if (pad_l == pad_r && pad_t == pad_b) { + Im2colWithEqualPadding( + output_h, output_w, data_im, channels, height, width, + kernel_h, kernel_w, dilation_h, dilation_w, pad_t, pad_l, + stride_h, stride_w, data_col, padding_value); + return; + } + + // Baseline + const int64_t dkernel_h = dilation_h * (kernel_h - 1) + 1; + const int64_t dkernel_w = dilation_w * (kernel_w - 1) + 1; + + int64_t height_col = (height + pad_t + pad_b - dkernel_h) / stride_h + 1; + int64_t width_col = (width + pad_l + pad_r - dkernel_w) / stride_w + 1; + + int64_t channels_col = channels * kernel_h * kernel_w; + for (int64_t c = 0; c < channels_col; ++c) { + int64_t w_offset = c % kernel_w; + int64_t h_offset = (c / kernel_w) % kernel_h; + int64_t c_im = c / kernel_h / kernel_w; + for (int64_t h = 0; h < height_col; ++h) { + for (int64_t w = 0; w < width_col; ++w) { + int64_t h_pad = h * stride_h - pad_t + h_offset * dilation_h; + int64_t w_pad = w * stride_w - pad_l + w_offset * dilation_w; + if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width) + data_col[(c * height_col + h) * width_col + w] = + data_im[(c_im * height + h_pad) * width + w_pad]; + else + data_col[(c * height_col + h) * width_col + w] = padding_value; + } + } + } +} + + +// Loop over spatial axes in reverse order to choose an index, like counting. 
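+// Example: with shape = {2, 3} and dims = {0, 0}, successive calls to
+// NextPosition below yield {0, 1}, {0, 2}, {1, 0}, {1, 1}, {1, 2}; the call
+// that wraps dims back to {0, 0} returns false, so a do/while loop visits
+// every spatial position exactly once.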
+inline bool NextPosition(int64_t N, const int64_t* shape, int64_t* dims) { + bool has_next_output = false; + for (int64_t d_i = N - 1; d_i >= 0; --d_i) { + int64_t d_max = shape[d_i]; + // assert dims[d_i] < d_max + if (dims[d_i] == d_max - 1) { + dims[d_i] = 0; + } + else { // dims[d_i] < d_max - 1 + ++dims[d_i]; + has_next_output = true; + break; + } + } + return has_next_output; } template -void ComputePadAndOutputShape( - const int64_t in_dim, const int64_t stride, - const int64_t kernel, const int64_t dilation, - AutoPadType pad_type, int64_t* pad_head, - int64_t* pad_tail, int64_t* out_dim, - bool ForceSymmetricAutoPadding) { - - const int64_t dkernel = dilation * (kernel - 1) + 1; - - if (pad_type == AutoPadType::NOTSET) { - *out_dim = static_cast(static_cast( - in_dim + *pad_head + *pad_tail - dkernel) / stride + 1); - } - else { - switch (pad_type) { - case AutoPadType::VALID: - *pad_head = 0; - *pad_tail = 0; - *out_dim = (in_dim - dkernel) / stride + 1; - break; - case AutoPadType::SAME_UPPER: - case AutoPadType::SAME_LOWER: { - if (dilation != 1) - throw std::runtime_error( - "Dilation not supported for AutoPadType::SAME_UPPER or AutoPadType::SAME_LOWER."); - int64_t legacy_target_size = (in_dim + stride - 1) / stride; - int64_t pad_needed = (legacy_target_size - 1) * stride + kernel - in_dim; - *out_dim = (in_dim + pad_needed - dkernel) / stride + 1; - - // make sure padding is symmetric - if (ForceSymmetricAutoPadding) - pad_needed = roundUpPow2(pad_needed); - - *pad_head = (pad_type == AutoPadType::SAME_LOWER) - ? (pad_needed + 1) / 2 - : pad_needed / 2; - *pad_tail = pad_needed - *pad_head; - } break; - default: - throw std::runtime_error("Invalid argument in ComputePadAndOutputShape."); - } - } +void Im2col_NCHW(const T* data_im, + int64_t group_channels, + int64_t input_channels, + const int64_t* im_shape, + const int64_t* output_shape, + const int64_t* kernel_shape, + const int64_t* stride, + const int64_t* dilation, + const int64_t* pad, + ptrdiff_t rank, + T* data_col, + T padding_value) { + // iterate dimensions on output image shape (without Batch and Channel) + std::vector d_output(rank, 0); + // inner iterate dimensions on kernel shape (without output channel and input channel) + std::vector d_kernel(rank, 0); + + // Loop over spatial axes along the output image shape + do { + // Loop over spatial axes in reverse order to choose an index on kernel dimensions + do { + // Loop over spatial axes in forward order to compute the indices in the image + // and the inner col, and whether the index lies in the padding. 
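+            // index_im accumulates a row-major offset over the spatial dimensions;
+            // a single out-of-range axis is enough to mark the position as padding.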
+ int64_t index_im = 0; + bool is_padding = false; + for (ptrdiff_t d_i = 0; d_i < rank; ++d_i) { + int64_t d_im = d_output[d_i] * stride[d_i] - pad[d_i] + d_kernel[d_i] * dilation[d_i]; + is_padding |= !is_a_ge_zero_and_a_lt_b(d_im, im_shape[d_i]); + index_im *= im_shape[d_i]; + index_im += d_im; + } + index_im *= input_channels; + + if (is_padding) { + data_col = std::fill_n(data_col, group_channels, padding_value); + } + else { + data_col = std::copy_n(data_im + index_im, group_channels, data_col); + } + } while (NextPosition(rank, kernel_shape, d_kernel.data())); + } while (NextPosition(rank, output_shape, d_output.data())); } + template -void ComputeTransposePadAndOutputShape( - const int64_t in_size, const int64_t stride, - const int64_t kernel, const int64_t dilation, - const int64_t adj, AutoPadType pad_type, - int64_t* pad_head, int64_t* pad_tail, - int64_t* out_size) { - if (*out_size != -1) { - // total padding size - int64_t paddings = std::max(0, (in_size - 1) * stride + adj + (kernel - 1) * dilation + 1 - *out_size); - if (pad_type == AutoPadType::SAME_UPPER) { // pad more on head when paddings are odd. - *pad_head = paddings - paddings / 2; - *pad_tail = paddings / 2; - } - else { - // for pad_type is NOTSET, SAME_LOWER or VALID - // set pad_head as paddings/2, pad_tail as paddings-paddings/2. - // That said, we pad more on tail when paddings are odd. - *pad_head = paddings / 2; - *pad_tail = paddings - paddings / 2; - } - return; - } - if (pad_type != AutoPadType::NOTSET) { - switch (pad_type) { - // We handle cases of AutoPadType::VALID and AutoPadType::SAME_UPPER/LOWER, - // the same way - case AutoPadType::VALID: - case AutoPadType::SAME_UPPER: - case AutoPadType::SAME_LOWER: - *pad_head = 0; - *pad_tail = 0; - *out_size = (in_size - 1) * stride + adj + (kernel - 1) * dilation + 1; - break; - default: - throw std::runtime_error("pad type not supported"); - } - } - else { - *out_size = (in_size - 1) * stride + adj + - (kernel - 1) * dilation + 1 - - *pad_head - *pad_tail; - } +void Im2col_NHWC(const T* data_im, + int64_t input_channels, + const int64_t* input_shape, + const int64_t* output_shape, + const int64_t* kernel_shape, + const int64_t* stride, + const int64_t* dilation, + const int64_t* pad, + ptrdiff_t rank, + int64_t output_start, + int64_t output_count, + T const ** data_indirection, + const T* padding_ptr) { + if (rank == 1) { + int64_t stride_w = stride[0]; + int64_t kernel_w = kernel_shape[0]; + int64_t dilation_w = dilation[0]; + int64_t pad_l = pad[0]; + int64_t input_w = input_shape[0]; + + int64_t ow = output_start * stride_w; + + while (output_count--) { + int64_t iw = ow - pad_l; + for (int64_t kw = 0; kw < kernel_w; kw++) { + const T* data_ptr = data_im + iw * input_channels; + data_indirection[kw] = (is_a_ge_zero_and_a_lt_b(iw, input_w) ? 
data_ptr : padding_ptr); + iw += dilation_w; + } + data_indirection += kernel_w; + ow += stride_w; + } + } + else if (rank == 2) { + int64_t stride_h = stride[0]; + int64_t stride_w = stride[1]; + int64_t kernel_h = kernel_shape[0]; + int64_t kernel_w = kernel_shape[1]; + int64_t dilation_h = dilation[0]; + int64_t dilation_w = dilation[1]; + int64_t pad_t = pad[0]; + int64_t pad_l = pad[1]; + int64_t input_h = input_shape[0]; + int64_t input_w = input_shape[1]; + int64_t output_w = output_shape[1]; + + int64_t oh = (output_start / output_w) * stride_h; + int64_t ow = (output_start % output_w) * stride_w; + int64_t ow_end = output_w * stride_w; + + while (output_count--) { + for (int64_t kh = 0; kh < kernel_h; kh++) { + int64_t ih = kh * dilation_h + oh - pad_t; + if (is_a_ge_zero_and_a_lt_b(ih, input_h)) { + int64_t ihw = ih * input_w; + int64_t iw = ow - pad_l; + for (int64_t kw = 0; kw < kernel_w; kw++) { + const T* data_ptr = data_im + (ihw + iw) * input_channels; + data_indirection[kw] = (is_a_ge_zero_and_a_lt_b(iw, input_w) ? data_ptr : padding_ptr); + iw += dilation_w; + } + } + else { + std::fill_n(data_indirection, kernel_w, padding_ptr); + } + data_indirection += kernel_w; + } + ow += stride_w; + if (ow == ow_end) { + oh += stride_h; + ow = 0; + } + } + } + else { + // iterate dimensions on output image shape (without Batch and Channel) + std::vector d_output(rank, 0); + // inner iterate dimensions on kernel shape (without output channel and input channel) + std::vector d_kernel(rank, 0); + + // Skip ahead to the starting output index. + for (ptrdiff_t d_i = rank - 1; d_i >= 0; --d_i) { + d_output[d_i] = output_start % output_shape[d_i]; + output_start /= output_shape[d_i]; + } + + while (output_count--) { + // Loop over spatial axes in reverse order to choose an index on kernel dimensions + do { + // Loop over spatial axes in forward order to compute the indices in the image + // and the inner col, and whether the index lies in the padding. + int64_t index_im = 0; + bool is_padding = false; + for (ptrdiff_t d_i = 0; d_i < rank; ++d_i) { + int64_t d_input = d_output[d_i] * stride[d_i] - pad[d_i] + d_kernel[d_i] * dilation[d_i]; + is_padding |= !is_a_ge_zero_and_a_lt_b(d_input, input_shape[d_i]); + index_im *= input_shape[d_i]; + index_im += d_input; + } + const T* data_ptr = data_im + index_im * input_channels; + *data_indirection++ = is_padding ? padding_ptr : data_ptr; + } while (NextPosition(rank, kernel_shape, d_kernel.data())); + // Loop over spatial axes along the output image shape + NextPosition(rank, output_shape, d_output.data()); + } + } } -// The function adds value to C, assuming this array -// was initialized. -template -void gemm(bool transA, bool transB, - size_t M, size_t N, size_t K, NTYPE alpha, - const NTYPE* A, const NTYPE* B, NTYPE beta, - NTYPE* C) { - - if (transA) { - if (transB) { - } - else { - // a A B + b C, dimension = M * N - NTYPE* begin; - NTYPE val; - NTYPE val0; - size_t i, j, k, maxc=0; - const NTYPE *pA, *pB; - for(i = 0, begin = C; i < M; ++i) { - for(j = 0; j < N; ++j, ++begin) { - val0 = *begin * beta; - val = 0; - pA = A + i; - pB = B + j; - for(k = K; k > 0; --k, pA += K, pB += N) - val += *pA * *pB; - *begin = val0 + val * alpha; - maxc = maxc > (size_t)(begin - C) ? 
maxc : (size_t)(begin - C); - if (maxc > M * N) - throw std::runtime_error("gemm10: maxc > M * N"); - } - } - return; - } - } - else { - if (transB) { - } - else { - // a A B + b C, dimension = M * N - NTYPE* begin; - NTYPE val; - NTYPE val0; - size_t i, j, k, maxc=0; - const NTYPE *pA, *pB; - for(i = 0, begin = C; i < M; ++i) { - for(j = 0; j < N; ++j, ++begin) { - val0 = *begin * beta; - val = 0; - pA = A + i * K; - pB = B + j; - for(k = K; k > 0; --k, ++pA, pB += N) - val += *pA * *pB; - *begin = val0 + val * alpha; - maxc = maxc > (size_t)(begin - C) ? maxc : (size_t)(begin - C); - if (maxc > M * N) - throw std::runtime_error("gemm00: maxc > M * N"); - } - } - return; - } - } - throw std::runtime_error("Not implemented for transposed matrices."); -} +template +void Im2col_NHWC(const T* data_im, + int64_t group_channels, + int64_t input_channels, + int64_t input_h, + int64_t input_w, + int64_t kernel_h, + int64_t kernel_w, + int64_t dilation_h, + int64_t dilation_w, + int64_t pad_t, + int64_t pad_l, + int64_t stride_h, + int64_t stride_w, + int64_t output_w, + int64_t output_start, + int64_t output_count, + T* data_col, + T padding_value) { + int64_t mh = output_start / output_w; + int64_t mw = output_start % output_w; + for (int64_t mz = output_start; mz < output_start + output_count; mz++) { + int64_t oh = mh * stride_h; + int64_t ow = mw * stride_w; + + for (int64_t kh = 0; kh < kernel_h; kh++) { + int64_t ih = kh * dilation_h + oh - pad_t; + + if (is_a_ge_zero_and_a_lt_b(ih, input_h)) { + int64_t iw = ow - pad_l; + if (dilation_w == 1 && group_channels == input_channels) { + int64_t kw = kernel_w; + while (kw > 0) { + if (is_a_ge_zero_and_a_lt_b(iw, input_w)) { + // Increase the copy count size to reduce the number of copy calls. + int64_t batch_w = std::min(kw, input_w - iw); + std::memcpy(data_col, data_im + (ih * input_w + iw) * group_channels, + static_cast(sizeof(T) * batch_w * group_channels)); + data_col += batch_w * group_channels; + iw += batch_w; + kw -= batch_w; + } + else { + data_col = std::fill_n(data_col, group_channels, padding_value); + iw++; + kw--; + } + } + } + else { + for (int64_t kw = 0; kw < kernel_w; kw++) { + if (is_a_ge_zero_and_a_lt_b(iw, input_w)) { + // N.B. Using std::memcpy helped here over std::copy_n when doing a + // transform for an image with a small number of group channels. + std::memcpy(data_col, data_im + (ih * input_w + iw) * input_channels, + static_cast(sizeof(T) * group_channels)); + data_col += group_channels; + } + else { + data_col = std::fill_n(data_col, group_channels, padding_value); + } + iw += dilation_w; + } + } + } + else { + data_col = std::fill_n(data_col, kernel_w * group_channels, padding_value); + } + } + + if (++mw == output_w) { + ++mh; + mw = 0; + } + } +} +void ComputePadAndOutputShape( + int64_t in_dim, int64_t stride, + int64_t kernel, int64_t dilation, + AutoPadType pad_type, int64_t* pad_head, + int64_t* pad_tail, int64_t* out_dim, + bool ForceSymmetricAutoPadding); + template -class ConvPoolCommon { +void ComputeTransposePadAndOutputShape( + int64_t in_size, int64_t stride, + int64_t kernel, int64_t dilation, + int64_t adj, AutoPadType pad_type, + int64_t* pad_head, int64_t* pad_tail, + int64_t* out_size) { + if (*out_size != -1) { + // total padding size + int64_t paddings = std::max(0, (in_size - 1) * stride + adj + (kernel - 1) * dilation + 1 - *out_size); + if (pad_type == AutoPadType::SAME_UPPER) { // pad more on head when paddings are odd. 
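+            // e.g. paddings = 5 gives pad_head = 3 and pad_tail = 2.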
+ *pad_head = paddings - paddings / 2; + *pad_tail = paddings / 2; + } + else { + // for pad_type is NOTSET, SAME_LOWER or VALID + // set pad_head as paddings/2, pad_tail as paddings-paddings/2. + // That said, we pad more on tail when paddings are odd. + *pad_head = paddings / 2; + *pad_tail = paddings - paddings / 2; + } + return; + } + if (pad_type != AutoPadType::NOTSET) { + switch (pad_type) { + // We handle cases of AutoPadType::VALID and AutoPadType::SAME_UPPER/LOWER, + // the same way + case AutoPadType::VALID: + case AutoPadType::SAME_UPPER: + case AutoPadType::SAME_LOWER: + *pad_head = 0; + *pad_tail = 0; + *out_size = (in_size - 1) * stride + adj + (kernel - 1) * dilation + 1; + break; + default: + throw std::runtime_error("pad type not supported"); + } + } + else { + *out_size = (in_size - 1) * stride + adj + + (kernel - 1) * dilation + 1 - + *pad_head - *pad_tail; + } +} + + +class ConvPoolCommonShape { + +protected: + + AutoPadType auto_pad_; + std::vector kernel_shape_; + +public: + + ConvPoolCommonShape() {} + + void init(const std::string& auto_pad, py_array_t kernel_shape); + void compute_kernel_shape(const std::vector& weight_shape, std::vector& kernel_shape) const; + + void infer_output_shape( + const std::vector& input_shape, + const std::vector& kernel_shape, + const std::vector& strides_p, + const std::vector& dilations_p, + std::vector& pads_p, + std::vector& output_shape, + bool ForceSymmetricAutoPadding) const; }; +class ConvPoolCommon : public ConvPoolCommonShape { + +protected: + + std::vector dilations_; + int64_t group_; + std::vector pads_; + std::vector strides_; + +public: + + void init(const std::string& auto_pad, + py_array_t dilations, + int64_t group, + py_array_t kernel_shape, + py_array_t pads, + py_array_t strides); +}; diff --git a/mlprodict/onnxrt/ops_cpu/op_conv_transpose_.cpp b/mlprodict/onnxrt/ops_cpu/op_conv_transpose_.cpp index 62c7019e6..7bba1b8de 100644 --- a/mlprodict/onnxrt/ops_cpu/op_conv_transpose_.cpp +++ b/mlprodict/onnxrt/ops_cpu/op_conv_transpose_.cpp @@ -23,16 +23,10 @@ namespace py = pybind11; template -class ConvTranspose : ConvPoolCommon { +class ConvTranspose : ConvPoolCommon { - private: + protected: - AutoPadType auto_pad_; - std::vector dilations_; - int64_t group_; - std::vector kernel_shape_; - std::vector pads_; - std::vector strides_; std::vector output_padding_; std::vector output_shape_; @@ -83,7 +77,7 @@ class ConvTranspose : ConvPoolCommon { }; template -ConvTranspose::ConvTranspose() : ConvPoolCommon() { +ConvTranspose::ConvTranspose() : ConvPoolCommon() { } @@ -96,14 +90,8 @@ void ConvTranspose::init( py::array_t pads, py::array_t strides, py::array_t output_padding, - py::array_t output_shape - ) { - auto_pad_ = to_AutoPadType(auto_pad); - array2vector(dilations_, dilations, int64_t); - group_ = group; - array2vector(dilations_, dilations, int64_t); - array2vector(pads_, pads, int64_t); - array2vector(strides_, strides, int64_t); + py::array_t output_shape) { + ConvPoolCommon::init(auto_pad, dilations, group, kernel_shape, pads, strides); array2vector(output_padding_, output_padding, int64_t); array2vector(output_shape_, output_shape, int64_t); } diff --git a/mlprodict/onnxrt/ops_cpu/op_max_pool_.cpp b/mlprodict/onnxrt/ops_cpu/op_max_pool_.cpp index afb64c8e8..15d898fe4 100644 --- a/mlprodict/onnxrt/ops_cpu/op_max_pool_.cpp +++ b/mlprodict/onnxrt/ops_cpu/op_max_pool_.cpp @@ -23,18 +23,12 @@ namespace py = pybind11; template -class MaxPool : ConvPoolCommon { +class MaxPool : ConvPoolCommon { private: - AutoPadType 
auto_pad_;
     int64_t ceil_mode_;
     int64_t storage_order_;
-    std::vector pads_;
-    std::vector strides_;
-    std::vector kernel_shape_;
-    std::vector dilations_;
-
     bool global_pooling_;

 public:
@@ -91,7 +85,7 @@ class MaxPool : ConvPoolCommon {

 template
-MaxPool::MaxPool() : ConvPoolCommon() {
+MaxPool::MaxPool() : ConvPoolCommon() {
     global_pooling_ = false;
 }

@@ -104,16 +98,10 @@ void MaxPool::init(
     int64_t storage_order,
     py::array_t kernel_shape,
     py::array_t pads,
-    py::array_t strides
-    ) {
-    auto_pad_ = to_AutoPadType(auto_pad);
-    array2vector(dilations_, dilations, int64_t);
+    py::array_t strides) {
+    ConvPoolCommon::init(auto_pad, dilations, 0, kernel_shape, pads, strides);
     ceil_mode_ = ceil_mode;
     storage_order_ = storage_order;
-    array2vector(dilations_, dilations, int64_t);
-    array2vector(pads_, pads, int64_t);
-    array2vector(strides_, strides, int64_t);
-    array2vector(kernel_shape_, kernel_shape, int64_t);
 }
diff --git a/mlprodict/onnxrt/ops_cpu/op_qlinear_conv.py b/mlprodict/onnxrt/ops_cpu/op_qlinear_conv.py
new file mode 100644
index 000000000..5317e7a8f
--- /dev/null
+++ b/mlprodict/onnxrt/ops_cpu/op_qlinear_conv.py
@@ -0,0 +1,58 @@
+# -*- encoding: utf-8 -*-
+# pylint: disable=E0203,E1101,C0111
+"""
+@file
+@brief Runtime operator.
+"""
+import numpy
+from ._op import OpRun
+from ..shape_object import ShapeObject
+from .op_qlinear_conv_ import QLinearConvInt8, QLinearConvUInt8  # pylint: disable=E0611,E0401
+
+
+class QLinearConv(OpRun):
+
+    atts = {'auto_pad': 'NOTSET',
+            'group': 1,
+            'dilations': [],
+            'kernel_shape': [],
+            'pads': [],
+            'strides': []}
+
+    def __init__(self, onnx_node, desc=None, **options):
+        OpRun.__init__(self, onnx_node, desc=desc,
+                       expected_attributes=QLinearConv.atts,
+                       **options)
+        self._init()
+        self._cstu8 = numpy.array([], dtype=numpy.uint8)
+        self._csti8 = numpy.array([], dtype=numpy.int8)
+
+    def _init(self):
+        self.rtu8_ = QLinearConvUInt8()
+        self.rti8_ = QLinearConvInt8()
+        for rt in [self.rtu8_, self.rti8_]:
+            rt.init(self.auto_pad,
+                    numpy.array(self.dilations, dtype=numpy.int64),
+                    self.group,
+                    numpy.array(self.kernel_shape, dtype=numpy.int64),
+                    numpy.array(self.pads, dtype=numpy.int64),
+                    numpy.array(self.strides, dtype=numpy.int64))
+
+    def _run(self, X, x_scale, x_zero_point, w, w_scale, w_zero_point,  # pylint: disable=W0221
+             y_scale, y_zero_point, B=None):
+        if X is None:
+            raise ValueError(  # pragma: no cover
+                "X cannot be None for operator %r, ONNX=%r" % (
+                    type(self), self.onnx_node))
+        if X.dtype == numpy.uint8:
+            return (self.rtu8_.compute(
+                X, x_scale, x_zero_point, w, w_scale, w_zero_point,
+                y_scale, y_zero_point, self._cstu8 if B is None else B), )
+        return (self.rti8_.compute(
+            X, x_scale, x_zero_point, w, w_scale, w_zero_point,
+            y_scale, y_zero_point, self._csti8 if B is None else B), )
+
+    def _infer_shapes(self, X, x_scale, x_zero_point, w, w_scale,  # pylint: disable=W0221
+                      w_zero_point, y_scale, y_zero_point, B=None):
+
+        return (ShapeObject(None, dtype=X.dtype), )
diff --git a/mlprodict/onnxrt/ops_cpu/op_qlinear_conv_.cpp b/mlprodict/onnxrt/ops_cpu/op_qlinear_conv_.cpp
new file mode 100644
index 000000000..2e905043e
--- /dev/null
+++ b/mlprodict/onnxrt/ops_cpu/op_qlinear_conv_.cpp
@@ -0,0 +1,61 @@
+// Inspired from
+// https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/core/providers/cpu/ml/tree_ensemble_classifier.cc.
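+//
+// QLinearConv computes, for every output element,
+//   y = saturate(round((x_scale * w_scale / y_scale)
+//                      * conv(x - x_zero_point, w - w_zero_point)) + y_zero_point).
+// The runtime below follows that scheme with an int32 GEMM followed by
+// RequantizeOutput (see op_qlinear_conv_.hpp).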
+ +#if !defined(_CRT_SECURE_NO_WARNINGS) +#define _CRT_SECURE_NO_WARNINGS +#endif + + +#include "op_qlinear_conv_.hpp" + + +#ifndef SKIP_PYTHON + + +class QLinearConvInt8 : public QLinearConv { +public: + QLinearConvInt8() : QLinearConv() {} +}; + + +class QLinearConvUInt8 : public QLinearConv { +public: + QLinearConvUInt8() : QLinearConv() {} +}; + + +PYBIND11_MODULE(op_qlinear_conv_, m) { + m.doc() = +#if defined(__APPLE__) + "Implements Conv operator." +#else + R"pbdoc(Implements runtime for operator QLinearConv. The code is inspired from +`conv.cc `_ +in :epkg:`onnxruntime`.)pbdoc" +#endif +; + + py::class_ clf(m, "QLinearConvUInt8", + R"pbdoc(Implements uint8 runtime for operator QLinearConvUInt8. The code is inspired from +`qlinearconv.cc `_ +in :epkg:`onnxruntime`. Supports uint8 only.)pbdoc"); + + clf.def(py::init<>()); + clf.def("init", &QLinearConvUInt8::init, + "Initializes the runtime with the ONNX attributes."); + clf.def("compute", &QLinearConvUInt8::compute, + "Computes the output for operator QLinearConv."); + + py::class_ cld(m, "QLinearConvInt8", + R"pbdoc(Implements int8 runtime for operator QLinearConv. The code is inspired from +`qlinearconv.cc `_ +in :epkg:`onnxruntime`. Supports int8 only.)pbdoc"); + + cld.def(py::init<>()); + cld.def("init", &QLinearConvInt8::init, + "Initializes the runtime with the ONNX attributes."); + cld.def("compute", &QLinearConvInt8::compute, + "Computes the output for operator QLinearConv."); +} + +#endif diff --git a/mlprodict/onnxrt/ops_cpu/op_qlinear_conv_.hpp b/mlprodict/onnxrt/ops_cpu/op_qlinear_conv_.hpp new file mode 100644 index 000000000..2d745f899 --- /dev/null +++ b/mlprodict/onnxrt/ops_cpu/op_qlinear_conv_.hpp @@ -0,0 +1,571 @@ +#pragma once + +// Inspired from +// https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/core/providers/cpu/ml/tree_ensemble_classifier.cc. + +#if !defined(_CRT_SECURE_NO_WARNINGS) +#define _CRT_SECURE_NO_WARNINGS +#endif + +#include "op_conv_matrices_.hpp" + +#define ROUNDING_BIAS_MAGIC 12582912.f +#define ROUNDING_BIAS_MAGIC_BITS 0x4B400000 + + +template +inline bool is_signed() { return true; } + +template <> +inline bool is_signed() { return false; } + + +inline uint32_t BitsOfFp32(float FloatValue) { + union { + uint32_t IntegerValue; + float FloatValue; + } u; + u.FloatValue = FloatValue; + return u.IntegerValue; +} + + + +/** +* This routine requantizes the intermediate buffer to the output buffer +* optionally adding the supplied bias. +* Parameters: +* * Input - Supplies the input matrix. +* * Output - Supplies the output matrix. +* * Bias - Supplies the optional bias vector to be added to the input buffer +* before requantization. +* * Buffer - Supplies the output matrix. +* * M - Supplies the number of elements of the bias vector and the number of +* rows in the output matrix. +* * N - Supplies the number of columns of the output matrix. +* * Scale - Supplies the quantization scale. +* * PerColumnScale - Supplies true if the quantization scale has per-column +* values, else false if a single quantization scale applies to the +* entire matrix. +* * ZeroPoint - Supplies the quantization zero point value. +*/ +template +void RequantizeOutput( + const int32_t* Input, + T* Output, + const int32_t* Bias, + size_t M, + size_t N, + const float* Scale, + bool PerColumnScale, + T ZeroPoint) { + const float PerMatrixScaleValue = PerColumnScale ? 
0.0f : *Scale; + const float MinimumValue = float(0 - ZeroPoint); + const float MaximumValue = float(255 - ZeroPoint); + + // + // Step through each row of the output matrix. + // + + while (M-- > 0) { + + const int32_t* bias = Bias; + const float* scale = Scale; + int32_t IntegerValue; + size_t n = N; + + while (n > 0) { + + IntegerValue = *Input++; + if (bias != nullptr) + IntegerValue += *bias++; + + float FloatValue = float(IntegerValue); + float ScaleValue = PerColumnScale ? *scale++ : PerMatrixScaleValue; + + FloatValue *= ScaleValue; + FloatValue = std::max(FloatValue, MinimumValue); + FloatValue = std::min(FloatValue, MaximumValue); + + // + // Use the fast rounding trick adapted from XNNPACK: bias the floating + // point value by the first floating point value that has no + // fractional bits. The add operation performs the "round to nearest + // even". Extract the mantissa bits from this floating point value to + // obtain the rounded integer value. + // + + IntegerValue = int32_t(BitsOfFp32(FloatValue + ROUNDING_BIAS_MAGIC)) - ROUNDING_BIAS_MAGIC_BITS; + *Output++ = T(IntegerValue + ZeroPoint); + n -= 1; + } + } +} + + +template +class QLinearConv : public ConvPoolCommon { + +private: + + std::vector W_shape_; + T* packed_W_buffer_; + size_t packed_W_size_; + T* reordered_W_buffer_; + bool is_W_signed_; + bool is_W_packed_; + bool channels_last_; + +public: + + typedef const T* T_const_ptr; + + QLinearConv() : ConvPoolCommon() {} + + void init( + const std::string& auto_pad, + py_array_t dilations, + int64_t group, + py_array_t kernel_shape, + py_array_t pads, + py_array_t strides) { + ConvPoolCommon::init(auto_pad, dilations, group, kernel_shape, pads, strides); + is_W_signed_ = is_signed(); + channels_last_ = false; + reordered_W_buffer_ = nullptr; + packed_W_buffer_ = nullptr; + is_W_packed_ = false; + packed_W_size_ = 0; + } + + py_array_t compute( + py_array_t X, + float x_scale, T x_zero_point, + py_array_t W, + py_array_t w_scale, + T w_zero_point, + float y_scale, T y_zero_point, + py_array_t B) const { + + std::vector x_dims; + arrayshape2vector(x_dims, X); + std::vector w_dims; + arrayshape2vector(w_dims, W); + + const int64_t N = x_dims[0]; + const int64_t M = w_dims[0]; + const auto& W_shape = M > 0 ? w_dims : W_shape_; + const bool is_W_signed = is_W_signed_; + //const Tensor* W = is_W_packed_ ? nullptr : context->Input(3); + + std::vector output_scales; + std::vector W_scale_shape; + arrayshape2vector(W_scale_shape, w_scale); + const int64_t W_scale_size = flattened_dimension(W_scale_shape); + const float* W_scale_data = w_scale.data(); + output_scales.resize(static_cast(W_scale_size)); + for (int64_t i = 0; i < W_scale_size; i++) + output_scales[i] = (x_scale * W_scale_data[i] / y_scale); + + std::vector kernel_shape; + compute_kernel_shape(w_dims, kernel_shape); + const size_t kernel_rank = kernel_shape.size(); + std::vector pads(pads_); + if (pads.empty()) + pads.resize(kernel_rank * 2, 0); + std::vector dilations(dilations_); + if (dilations.empty()) + dilations.resize(kernel_rank, 1); + std::vector strides(strides_); + if (strides.empty()) + strides.resize(kernel_rank, 1); + + const int64_t C = x_dims[channels_last_ ? 1 + kernel_rank : 1]; + const size_t spatial_dim_start = channels_last_ ? 
1 : 2; + const size_t spatial_dim_end = spatial_dim_start + kernel_rank; + std::vector y_dims({ N }); + if (!channels_last_) + y_dims.push_back(M); + + std::vector sliced_input_shape(spatial_dim_end - spatial_dim_start); + for (size_t i = 0; i < sliced_input_shape.size(); ++i) + sliced_input_shape[i] = x_dims[i + spatial_dim_start]; + + infer_output_shape(sliced_input_shape, kernel_shape, strides, dilations, pads, y_dims, false); + if (channels_last_) + y_dims.push_back(M); + + std::vector output_shape(y_dims.begin() + spatial_dim_start, y_dims.begin() + spatial_dim_end); + py_array_t Y(y_dims); + { + py_gil_scoped_release release; + compute_gil_free(X, W, B, Y, + sliced_input_shape, output_shape, + kernel_shape, pads, dilations, strides, + x_dims, y_dims, w_dims, + x_scale, x_zero_point, + w_scale, w_zero_point, + y_scale, y_zero_point, + output_scales); + } + return Y; + } + +private: + + bool HasStridesOneAndNoPadding() const { + if (std::all_of(strides_.begin(), strides_.end(), [](int64_t v) { return v == 1; })) { + if (std::all_of(pads_.begin(), pads_.end(), [](int64_t v) { return v == 0; })) + return true; + } + return false; + } + + void ReorderFilter( + const T* input, T* output, + size_t output_channels, + size_t input_channels, + size_t kernel_size) const { + for (size_t k = 0; k < kernel_size; k++) { + for (size_t ic = 0; ic < input_channels; ic++) { + for (size_t oc = 0; oc < output_channels; oc++) { + size_t index = (oc * input_channels * kernel_size) + (ic * kernel_size) + k; + *output++ = input[index]; + } + } + } + } + + + void compute_gil_free( + py_array_t X, py_array_t W, + py_array_t B, py_array_t& Y, + const std::vector& input_shape, + const std::vector& output_shape, + const std::vector& kernel_shape, + const std::vector& pads, + const std::vector& dilations, + const std::vector& strides, + const std::vector& x_dims, + const std::vector& y_dims, + const std::vector& w_dims, + float x_scale, T x_zero_point, + py_array_t w_scale, T w_zero_point, + float y_scale, T y_zero_point, + const std::vector& output_scales) const { + + // see https://github.com/microsoft/onnxruntime/pull/7885/ + std::vector b_dims; + arrayshape2vector(b_dims, B); + + const int64_t N = x_dims[0]; + const int64_t C = x_dims[1]; + const int64_t M = w_dims[0]; + + const int64_t input_image_size = flattened_dimension(input_shape); + const int64_t output_image_size = flattened_dimension(output_shape); + const int64_t kernel_size = flattened_dimension(kernel_shape); + + // Handle the case of a dynamic weight filter. + T* reordered_W_buffer = nullptr; + T* reordered_W = nullptr; + if (!packed_W_buffer_) { + if (w_dims.empty()) { + // Weight was constant and reordered. + reordered_W = reordered_W_buffer_; + } + else { + // Weight tensor was not constant or prepacking is disabled. + reordered_W = new T[flattened_dimension(w_dims)]; + reordered_W_buffer = reordered_W; + ReorderFilter(W.data(), + reordered_W, + static_cast(M), + static_cast(w_dims[1]), + static_cast(kernel_size)); + } + } + + int64_t group_count = group_; + int64_t group_input_channels = w_dims[1]; + int64_t group_output_channels = M / group_count; + + // Test for depthwise convolution. + const bool is_depthwise_conv = (reordered_W != nullptr && group_input_channels == 1 && group_output_channels == 1); + if (is_depthwise_conv) { + // Update the input and output channels to the number of groups in order to + // reuse as much of the below standard convolution path. 
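+            // A depthwise convolution with G groups has one input and one output
+            // channel per group; treating it as a single group of G channels lets
+            // QConvDepthwise multiply each channel by its own filter instead of
+            // running one small GEMM per group.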
+ group_input_channels = group_count; + group_output_channels = group_count; + group_count = 1; + } + + const int64_t X_offset = C * input_image_size; + const int64_t Y_offset = M * output_image_size; + const int64_t kernel_dim = group_input_channels * kernel_size; + const int64_t col_buffer_size = kernel_dim * output_image_size; + + // Use an intermediate int32_t buffer for the GEMM computation before + // requantizing to the output type. + int32_t* gemm_output_data = new int32_t[Y_offset]; + int32_t* gemm_output_buffer = gemm_output_data; + int32_t* gemm_output = gemm_output_data; + + const T* Xdata = X.data(); + const int32_t* Bdata = (int32_t*)(b_dims.size() > 0 ? B.data() : nullptr); + T* Ydata = (T*)Y.data(); + + T* transpose_input_buffer = nullptr; + T* transpose_output_buffer = nullptr; + T* transpose_input = nullptr; + T* transpose_output = nullptr; + if (!channels_last_) { + transpose_input = new T[X_offset]; + transpose_input_buffer = transpose_input; + transpose_output = new T[Y_offset]; + transpose_output_buffer = transpose_output; + } + + T_const_ptr* col_buffer = nullptr; + T_const_ptr* col_data = nullptr; + std::vector padding_data; + const size_t kernel_rank = kernel_shape.size(); + + if (is_depthwise_conv) { + // Allocate indirection buffer pointers and prepare a padding vector for + // the im2col transform. + col_data = new T_const_ptr[kernel_size * output_image_size]; + col_buffer = col_data; + padding_data.resize(static_cast(C), x_zero_point); + } + else if (kernel_size != 1 || !HasStridesOneAndNoPadding()) { + // Pointwise convolutions can use the original input tensor in place, + // otherwise a temporary buffer is required for the im2col transform. + int64_t group_col_buffer_size = (kernel_rank > 2) ? group_count * col_buffer_size : col_buffer_size; + col_data = new T_const_ptr[group_col_buffer_size]; + col_buffer = col_data; + } + + // See onnxruntime. + int32_t maximum_thread_count = 16; + constexpr double thread_complexity = static_cast(64 * 1024); + const double complexity = static_cast(output_image_size) * + static_cast(group_output_channels) * + static_cast(kernel_dim); + + + // OMP +#if false //USE_OPENMP + int32_t thread_count = maximum_thread_count; + if (complexity < thread_complexity * maximum_thread_count) + thread_count = static_cast(complexity / thread_complexity) + 1; + // Ensure that every thread produces at least one output. + if (thread_count > output_image_size) + thread_count = static_cast(output_image_size); + thread_count = std::min(thread_count, ::omp_get_max_threads()); +#else + int32_t thread_count = 1; +#endif + + for (int64_t image_id = 0; image_id < N; ++image_id) { + const T* input_data = Xdata; + T* output_data = Ydata; + + if (!channels_last_) { + // Transpose the input from channels first (NCHW) to channels last (NHWC). + TensorTranspose( + Xdata, + transpose_input_buffer, + static_cast(C), + static_cast(input_image_size)); + input_data = transpose_input_buffer; + output_data = transpose_output_buffer; + } + + // Threaded implementation of ND convolution is not yet supported, so + // prepare all im2col transformations here. 
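+            // Each group owns its own slice of col_buffer (offset by
+            // group_id * col_buffer_size); the threaded loop below only picks
+            // the matching slice.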
+ if (!is_depthwise_conv && col_buffer && kernel_rank > 2) { + for (int64_t group_id = 0; group_id < group_count; ++group_id) { + Im2col_NCHW( + input_data + group_id * group_input_channels, + group_input_channels, + C, + input_shape.data(), + output_shape.data(), + kernel_shape.data(), + strides.data(), + dilations.data(), + pads.data(), + static_cast(kernel_rank), + ((T*)col_buffer) + group_id * col_buffer_size, // sizeof(T*) > sizeof(T) + x_zero_point); + } + } + +#if false // USE_OPENMP +#pragma omp parallel for +#endif + for (int32_t batch_idx = 0; batch_idx < thread_count; ++batch_idx) { + int64_t output_start, output_end; + std::ptrdiff_t work_per_batch = output_image_size / thread_count; + std::ptrdiff_t work_per_batch_extra = output_image_size % thread_count; + if (batch_idx < work_per_batch_extra) { + output_start = (work_per_batch + 1) * batch_idx; + output_end = output_start + work_per_batch + 1; + } + else { + output_start = work_per_batch * batch_idx + work_per_batch_extra; + output_end = output_start + work_per_batch; + } + int64_t output_count = output_end - output_start; + + int32_t* worker_gemm_output = gemm_output + output_start * M; + T* worker_requantize_output = output_data + output_start * M; + + if (is_depthwise_conv) { + T_const_ptr* worker_col_buffer = col_buffer + output_start * kernel_size; + Im2col_NHWC( + input_data, + C, + input_shape.data(), + output_shape.data(), + kernel_shape.data(), + strides.data(), + dilations.data(), + pads.data(), + static_cast(kernel_rank), + output_start, + output_count, + worker_col_buffer, + padding_data.data()); + QConvDepthwise( + worker_col_buffer, + x_zero_point, + reordered_W, + w_zero_point, + is_signed(), + worker_gemm_output, + static_cast(M), + static_cast(output_count), + static_cast(kernel_size)); + } + else { + for (int64_t group_id = 0; group_id < group_count; ++group_id) { + // Prepare the im2col transformation or use the input buffer directly for + // pointwise convolutions. + const T* worker_gemm_input; + if (col_buffer) { + T* worker_col_buffer = ((T*)col_buffer) + output_start * kernel_dim; + if (kernel_rank == 2) { + Im2col_NHWC( + input_data + group_id * group_input_channels, + group_input_channels, + C, + input_shape[0], + input_shape[1], + kernel_shape[0], + kernel_shape[1], + dilations[0], + dilations[1], + pads[0], + pads[1], + strides[0], + strides[1], + output_shape[1], + output_start, + output_count, + worker_col_buffer, + x_zero_point); + } + else if (kernel_rank == 1) { + Im2col_NHWC( + input_data + group_id * group_input_channels, + group_input_channels, + C, + 1, + input_shape[0], + 1, + kernel_shape[0], + 1, + dilations[0], + 0, + pads[0], + 1, + strides[0], + output_shape[0], + output_start, + output_count, + worker_col_buffer, + x_zero_point); + } + else { + // Use the im2col buffer prepared outside the thread, indexed by group. 
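+                            // kernel_rank > 2: this slice was already filled by the
+                            // ND Im2col_NCHW call before the thread loop, so no
+                            // per-thread transform is needed here.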
+ worker_col_buffer += group_id * col_buffer_size; + } + worker_gemm_input = worker_col_buffer; + } + else { + worker_gemm_input = input_data + output_start * kernel_dim; + } + + size_t ldb = 0; + const T* ptrB; + bool BIsPacked = false; + if (packed_W_buffer_) { + ptrB = static_cast(packed_W_buffer_) + group_id * packed_W_size_, + BIsPacked = true; + } + else { + ptrB = reordered_W + group_id * group_output_channels, + ldb = static_cast(M); + } + + QGemm( + false, false, + static_cast(output_count), // M + static_cast(group_output_channels), // N + static_cast(kernel_dim), 1, // K, alpha + worker_gemm_input, ptrB, 1, // A, B, beta + worker_gemm_output + group_id * group_output_channels, // C + static_cast(kernel_dim), ldb, static_cast(M), // lda, ldb, ldc + x_zero_point, &w_zero_point, // ZeroPointA, ZeroPointB + BIsPacked, false); // BIsPacked, PerColumnZeroPoints + } + } + + RequantizeOutput( + worker_gemm_output, + worker_requantize_output, + Bdata, + static_cast(output_count), + static_cast(M), + output_scales.data(), + output_scales.size() > 1, + y_zero_point); + } + + if (!channels_last_) { + // Transpose the output from channels last (NHWC) to channels first (NCHW). + TensorTranspose( + output_data, + Ydata, + static_cast(output_image_size), + static_cast(M)); + } + + Xdata += X_offset; + Ydata += Y_offset; + + } + + if (transpose_input != nullptr) + delete[] transpose_input; + if (transpose_output != nullptr) + delete[] transpose_output; + if (reordered_W_buffer != nullptr) + delete[] reordered_W_buffer; + if (col_data != nullptr) + delete[] col_data; + } +}; + diff --git a/mlprodict/onnxrt/ops_cpu/op_where.py b/mlprodict/onnxrt/ops_cpu/op_where.py index e7e20af2a..0b8c6f9dd 100644 --- a/mlprodict/onnxrt/ops_cpu/op_where.py +++ b/mlprodict/onnxrt/ops_cpu/op_where.py @@ -19,7 +19,7 @@ def _run(self, condition, x, y): # pylint: disable=W0221 raise RuntimeError( # pragma: no cover "x and y should share the same dtype {} != {}".format( x.dtype, y.dtype)) - if x.shape != y.shape: + if x.shape != y.shape and x.shape != (1, ) and y.shape != (1, ): raise RuntimeError( # pragma: no cover "x and y should share the same shape {} != {}".format( x.shape, y.shape)) diff --git a/setup.py b/setup.py index 31c2fc4b4..9dd2ace6e 100644 --- a/setup.py +++ b/setup.py @@ -95,6 +95,7 @@ def get_extensions(): ext_max_pool = Extension( 'mlprodict.onnxrt.ops_cpu.op_max_pool_', [os.path.join(root, 'mlprodict/onnxrt/ops_cpu/op_max_pool_.cpp'), + os.path.join(root, 'mlprodict/onnxrt/ops_cpu/op_conv_matrices_.cpp'), os.path.join(root, 'mlprodict/onnxrt/ops_cpu/op_common_.cpp'), os.path.join(root, 'mlprodict/onnxrt/ops_cpu/op_common_num_.cpp')], extra_compile_args=extra_compile_args, @@ -255,6 +256,7 @@ def get_extensions(): ext_conv = Extension( 'mlprodict.onnxrt.ops_cpu.op_conv_', [os.path.join(root, 'mlprodict/onnxrt/ops_cpu/op_conv_.cpp'), + os.path.join(root, 'mlprodict/onnxrt/ops_cpu/op_conv_matrices_.cpp'), os.path.join(root, 'mlprodict/onnxrt/ops_cpu/op_common_.cpp')], extra_compile_args=extra_compile_args, extra_link_args=extra_link_args, @@ -267,9 +269,27 @@ def get_extensions(): define_macros=define_macros, language='c++') + ext_qlinearconv = Extension( + 'mlprodict.onnxrt.ops_cpu.op_qlinear_conv_', + [os.path.join(root, 'mlprodict/onnxrt/ops_cpu/op_qlinear_conv_.cpp'), + os.path.join(root, 'mlprodict/onnxrt/ops_cpu/op_conv_matrices_.cpp'), + os.path.join(root, 'mlprodict/onnxrt/ops_cpu/op_common_.cpp') + ], + extra_compile_args=extra_compile_args, + extra_link_args=extra_link_args, + include_dirs=[ 
+ # Path to pybind11 headers + get_pybind_include(), + get_pybind_include(user=True), + os.path.join(root, 'mlprodict/onnxrt/ops_cpu') + ], + define_macros=define_macros, + language='c++') + ext_conv_transpose = Extension( 'mlprodict.onnxrt.ops_cpu.op_conv_transpose_', [os.path.join(root, 'mlprodict/onnxrt/ops_cpu/op_conv_transpose_.cpp'), + os.path.join(root, 'mlprodict/onnxrt/ops_cpu/op_conv_matrices_.cpp'), os.path.join(root, 'mlprodict/onnxrt/ops_cpu/op_common_.cpp')], extra_compile_args=extra_compile_args, extra_link_args=extra_link_args, @@ -316,6 +336,7 @@ def get_extensions(): ext_experimental_c, ext_gather, ext_max_pool, + ext_qlinearconv, ext_svm_classifier, ext_svm_regressor, ext_tfidfvectorizer,