diff --git a/README.rst b/README.rst index d3fd35197..24b612b09 100644 --- a/README.rst +++ b/README.rst @@ -7,8 +7,8 @@ mlprodict ========= -.. image:: https://travis-ci.org/sdpython/mlprodict.svg?branch=master - :target: https://travis-ci.org/sdpython/mlprodict +.. image:: https://travis-ci.com/sdpython/mlprodict.svg?branch=master + :target: https://travis-ci.com/sdpython/mlprodict :alt: Build status .. image:: https://ci.appveyor.com/api/projects/status/g8chk1ufyk1m8uep?svg=true diff --git a/_doc/sphinxdoc/source/conf.py b/_doc/sphinxdoc/source/conf.py index 40a8ae86f..6c179487f 100644 --- a/_doc/sphinxdoc/source/conf.py +++ b/_doc/sphinxdoc/source/conf.py @@ -2,7 +2,7 @@ import sys import os import alabaster -from pyquickhelper.helpgen.default_conf import set_sphinx_variables, get_default_stylesheet +from pyquickhelper.helpgen.default_conf import set_sphinx_variables from sklearn.experimental import enable_hist_gradient_boosting try: from mlprodict.onnx_conv import register_converters, register_rewritten_operators @@ -47,10 +47,7 @@ 'generate_visual_graphs', ]) -html_context = { - 'css_files': get_default_stylesheet([ - '_static/my-styles.css', '_static/gallery.css']), -} +html_css_files = ['my-styles.css'] html_logo = "phdoc_static/project_ico.png" diff --git a/_doc/sphinxdoc/source/index.rst b/_doc/sphinxdoc/source/index.rst index f86ab16c2..2e2a73a51 100644 --- a/_doc/sphinxdoc/source/index.rst +++ b/_doc/sphinxdoc/source/index.rst @@ -13,8 +13,8 @@ mlprodict :ref:`l-README`, :ref:`blog ` -.. image:: https://travis-ci.org/sdpython/mlprodict.svg?branch=master - :target: https://travis-ci.org/sdpython/mlprodict +.. image:: https://travis-ci.com/sdpython/mlprodict.svg?branch=master + :target: https://travis-ci.com/sdpython/mlprodict :alt: Build status .. 
image:: https://ci.appveyor.com/api/projects/status/g8chk1ufyk1m8uep?svg=true diff --git a/_unittests/ut_onnxrt/test_onnxrt_python_runtime_.py b/_unittests/ut_onnxrt/test_onnxrt_python_runtime_.py index ec8abc9cb..2b476e605 100644 --- a/_unittests/ut_onnxrt/test_onnxrt_python_runtime_.py +++ b/_unittests/ut_onnxrt/test_onnxrt_python_runtime_.py @@ -47,7 +47,7 @@ OnnxNeg, OnnxNot, OnnxOr, OnnxPad, OnnxPow, - OnnxQuantizeLinear, + OnnxQLinearConv, OnnxQuantizeLinear, OnnxReciprocal, OnnxReduceL1, OnnxReduceL2, OnnxReduceLogSumExp, OnnxReduceMax, OnnxReduceMean, OnnxReduceMin, @@ -2252,6 +2252,53 @@ def test_onnxt_runtime_pad2(self): def test_onnxt_runtime_pow(self): self.common_test_onnxt_runtime_binary(OnnxPow, numpy.power) + @wraplog() + def test_onnxt_runtime_qlinear_conv(self): + x = numpy.array( + [[255, 174, 162, 25, 203, 168, 58], + [15, 59, 237, 95, 129, 0, 64], + [56, 242, 153, 221, 168, 12, 166], + [232, 178, 186, 195, 237, 162, 237], + [188, 39, 124, 77, 80, 102, 43], + [127, 230, 21, 83, 41, 40, 134], + [255, 154, 92, 141, 42, 148, 247], ], + dtype=numpy.uint8).reshape((1, 1, 7, 7)) + + x_scale = numpy.float32(0.00369204697) + x_zero_point = numpy.uint8(132) + + w = numpy.array([0], dtype=numpy.uint8).reshape((1, 1, 1, 1)) + + w_scale = numpy.array([0.00172794575], dtype=numpy.float32) + w_zero_point = numpy.array([255], dtype=numpy.uint8) + + y_scale = numpy.float32(0.00162681262) + y_zero_point = numpy.uint8(123) + + output = numpy.array( + [[0, 81, 93, 230, 52, 87, 197], + [240, 196, 18, 160, 126, 255, 191], + [199, 13, 102, 34, 87, 243, 89], + [23, 77, 69, 60, 18, 93, 18], + [67, 216, 131, 178, 175, 153, 212], + [128, 25, 234, 172, 214, 215, 121], + [0, 101, 163, 114, 213, 107, 8], ], + dtype=numpy.uint8).reshape((1, 1, 7, 7)) + + node = OnnxQLinearConv('x', 'x_scale', 'x_zero_point', 'w', + 'w_scale', 'w_zero_point', 'y_scale', + 'y_zero_point', output_names=['y'], + op_version=get_opset_number_from_onnx()) + inputs = {'x': x, 'x_scale': x_scale, 'x_zero_point': x_zero_point, + 'w': w, 'w_scale': w_scale, 'w_zero_point': w_zero_point, + 'y_scale': y_scale, 'y_zero_point': y_zero_point} + model_def = node.to_onnx(inputs, + target_opset=get_opset_number_from_onnx()) + oinf = OnnxInference(model_def) + got = oinf.run(inputs)['y'] + self.assertEqualArray(output, got) + python_tested.append(OnnxQLinearConv) + @wraplog() def test_onnxt_runtime_quantize_linear(self): X = numpy.array([[[[-162, 10], [-100, 232], [-20, -50]], @@ -3507,5 +3554,5 @@ def test_make_constant(self): if __name__ == "__main__": - # TestOnnxrtPythonRuntime().test_onnxt_runtime_cast_in() + # TestOnnxrtPythonRuntime().test_onnxt_runtime_qlinear_conv() unittest.main() diff --git a/bin/debug/debug.cpp b/bin/debug/debug.cpp new file mode 100644 index 000000000..4ec7112a6 --- /dev/null +++ b/bin/debug/debug.cpp @@ -0,0 +1,59 @@ +// debug.cpp : Ce fichier contient la fonction 'main'. L'exécution du programme commence et se termine à cet endroit. 
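+// English: this file contains the 'main' function; program execution starts and ends here.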
+// + +#include "op_qlinear_conv_.hpp" + + +void test_qlinear_conv() { + QLinearConv conv; + + const std::string auto_pad = "NOTSET"; + py_array_t dilations; + py_array_t kernel_shape; + py_array_t pads; + py_array_t strides; + + conv.init(auto_pad, dilations, 1, kernel_shape, pads, strides); + + py_array_t x({ + 255, 174, 162, 25, 203, 168, 58, + 15, 59, 237, 95, 129, 0, 64, + 56, 242, 153, 221, 168, 12, 166, + 232, 178, 186, 195, 237, 162, 237, + 188, 39, 124, 77, 80, 102, 43, + 127, 230, 21, 83, 41, 40, 134, + 255, 154, 92, 141, 42, 148, 247 }, + { 1, 1, 7, 7 }); + + float x_scale = (float)0.00369204697; + uint8_t x_zero_point = 132; + + py_array_t w({ 0 }, { 1, 1, 1, 1 }); + + py_array_t w_scale({ (float)0.00172794575 }, {}); + uint8_t w_zero_point = 255; + + float y_scale = (float)0.00162681262; + uint8_t y_zero_point = 123; + + py_array_t B; + + py_array_t output({ + 0, 81, 93, 230, 52, 87, 197, + 240, 196, 18, 160, 126, 255, 191, + 199, 13, 102, 34, 87, 243, 89, + 23, 77, 69, 60, 18, 93, 18, + 67, 216, 131, 178, 175, 153, 212, + 128, 25, 234, 172, 214, 215, 121, + 0, 101, 163, 114, 213, 107, 8 }, + { 1, 1, 7, 7 }); + py_array_t < uint8_t> res = conv.compute( + x, x_scale, x_zero_point, w, w_scale, w_zero_point, y_scale, y_zero_point, B); + if (!res.equal(output)) + throw std::runtime_error("failed"); +} + + +int main() { + test_qlinear_conv(); +} diff --git a/bin/debug/debug.sln b/bin/debug/debug.sln new file mode 100644 index 000000000..e2ffbb37f --- /dev/null +++ b/bin/debug/debug.sln @@ -0,0 +1,31 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 16 +VisualStudioVersion = 16.0.31321.278 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "debug", "debug.vcxproj", "{714F6D8B-15D6-4E4E-940C-CC1AFA599EF1}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Debug|x86 = Debug|x86 + Release|x64 = Release|x64 + Release|x86 = Release|x86 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {714F6D8B-15D6-4E4E-940C-CC1AFA599EF1}.Debug|x64.ActiveCfg = Debug|x64 + {714F6D8B-15D6-4E4E-940C-CC1AFA599EF1}.Debug|x64.Build.0 = Debug|x64 + {714F6D8B-15D6-4E4E-940C-CC1AFA599EF1}.Debug|x86.ActiveCfg = Debug|Win32 + {714F6D8B-15D6-4E4E-940C-CC1AFA599EF1}.Debug|x86.Build.0 = Debug|Win32 + {714F6D8B-15D6-4E4E-940C-CC1AFA599EF1}.Release|x64.ActiveCfg = Release|x64 + {714F6D8B-15D6-4E4E-940C-CC1AFA599EF1}.Release|x64.Build.0 = Release|x64 + {714F6D8B-15D6-4E4E-940C-CC1AFA599EF1}.Release|x86.ActiveCfg = Release|Win32 + {714F6D8B-15D6-4E4E-940C-CC1AFA599EF1}.Release|x86.Build.0 = Release|Win32 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {CC1B1913-0A0E-45AE-827F-C6D97913A40D} + EndGlobalSection +EndGlobal diff --git a/bin/debug/debug.vcxproj b/bin/debug/debug.vcxproj new file mode 100644 index 000000000..aa5de2aad --- /dev/null +++ b/bin/debug/debug.vcxproj @@ -0,0 +1,154 @@ + + + + + Debug + Win32 + + + Release + Win32 + + + Debug + x64 + + + Release + x64 + + + + 16.0 + Win32Proj + {714f6d8b-15d6-4e4e-940c-cc1afa599ef1} + debug + 10.0 + + + + Application + true + v142 + Unicode + + + Application + false + v142 + true + Unicode + + + Application + true + v142 + Unicode + + + Application + false + v142 + true + Unicode + + + + + + + + + + + + + + + + + + + + + true + + + false + + + true + + + false 
+ + + + Level3 + true + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + + + Console + true + + + + + Level3 + true + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + + + Console + true + true + true + + + + + Level3 + true + _DEBUG;_CONSOLE;%(PreprocessorDefinitions);SKIP_PYTHON + true + ../../mlprodict/onnxrt/ops_cpu + + + Console + true + + + + + Level3 + true + true + true + NDEBUG;_CONSOLE;%(PreprocessorDefinitions);SKIP_PYTHON + true + + + Console + true + true + true + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/mlprodict/onnxrt/ops_cpu/_op_list.py b/mlprodict/onnxrt/ops_cpu/_op_list.py index ad6dfa1be..e0dbb8209 100644 --- a/mlprodict/onnxrt/ops_cpu/_op_list.py +++ b/mlprodict/onnxrt/ops_cpu/_op_list.py @@ -78,6 +78,7 @@ from .op_pad import Pad from .op_pow import Pow from .op_quantize_linear import QuantizeLinear +from .op_qlinear_conv import QLinearConv from .op_reciprocal import Reciprocal from .op_reduce_log_sum_exp import ReduceLogSumExp from .op_reduce_l1 import ReduceL1 diff --git a/mlprodict/onnxrt/ops_cpu/op_common_.cpp b/mlprodict/onnxrt/ops_cpu/op_common_.cpp index df53a5207..cc968637c 100644 --- a/mlprodict/onnxrt/ops_cpu/op_common_.cpp +++ b/mlprodict/onnxrt/ops_cpu/op_common_.cpp @@ -7,136 +7,132 @@ #include // realloc -POST_EVAL_TRANSFORM to_POST_EVAL_TRANSFORM(const std::string &value) { - if (value.compare("NONE") == 0) return POST_EVAL_TRANSFORM::NONE; - if (value.compare("LOGISTIC") == 0) return POST_EVAL_TRANSFORM::LOGISTIC; - if (value.compare("SOFTMAX") == 0) return POST_EVAL_TRANSFORM::SOFTMAX; - if (value.compare("SOFTMAX_ZERO") == 0) return POST_EVAL_TRANSFORM::SOFTMAX_ZERO; - if (value.compare("PROBIT") == 0) return POST_EVAL_TRANSFORM::PROBIT; - throw std::runtime_error(std::string("NODE_MODE '") + - value + - std::string("' is not defined.")); +POST_EVAL_TRANSFORM to_POST_EVAL_TRANSFORM(const std::string& value) { + if (value.compare("NONE") == 0) return POST_EVAL_TRANSFORM::NONE; + if (value.compare("LOGISTIC") == 0) return POST_EVAL_TRANSFORM::LOGISTIC; + if (value.compare("SOFTMAX") == 0) return POST_EVAL_TRANSFORM::SOFTMAX; + if (value.compare("SOFTMAX_ZERO") == 0) return POST_EVAL_TRANSFORM::SOFTMAX_ZERO; + if (value.compare("PROBIT") == 0) return POST_EVAL_TRANSFORM::PROBIT; + throw std::runtime_error(std::string("NODE_MODE '") + + value + + std::string("' is not defined.")); } -NODE_MODE to_NODE_MODE(const std::string &value) { - if (value.compare("BRANCH_LEQ") == 0) return NODE_MODE::BRANCH_LEQ; - if (value.compare("BRANCH_LT") == 0) return NODE_MODE::BRANCH_LT; - if (value.compare("BRANCH_GTE") == 0) return NODE_MODE::BRANCH_GTE; - if (value.compare("BRANCH_GT") == 0) return NODE_MODE::BRANCH_GT; - if (value.compare("BRANCH_EQ") == 0) return NODE_MODE::BRANCH_EQ; - if (value.compare("BRANCH_NEQ") == 0) return NODE_MODE::BRANCH_NEQ; - if (value.compare("LEAF") == 0) return NODE_MODE::LEAF; - throw std::runtime_error(std::string("NODE_MODE '") + - value + - std::string("' is not defined.")); +NODE_MODE to_NODE_MODE(const std::string& value) { + if (value.compare("BRANCH_LEQ") == 0) return NODE_MODE::BRANCH_LEQ; + if (value.compare("BRANCH_LT") == 0) return NODE_MODE::BRANCH_LT; + if (value.compare("BRANCH_GTE") == 0) return NODE_MODE::BRANCH_GTE; + if (value.compare("BRANCH_GT") == 0) return NODE_MODE::BRANCH_GT; + if (value.compare("BRANCH_EQ") == 0) return NODE_MODE::BRANCH_EQ; + if (value.compare("BRANCH_NEQ") == 0) return NODE_MODE::BRANCH_NEQ; + if (value.compare("LEAF") == 0) return 
NODE_MODE::LEAF; + throw std::runtime_error(std::string("NODE_MODE '") + + value + + std::string("' is not defined.")); } -const char * to_str(NODE_MODE mode) { - switch(mode) { - case NODE_MODE::BRANCH_LEQ: return "BRANCH_LEQ"; - case NODE_MODE::BRANCH_LT: return "BRANCH_LT"; - case NODE_MODE::BRANCH_GTE: return "BRANCH_GTE"; - case NODE_MODE::BRANCH_GT: return "BRANCH_GT"; - case NODE_MODE::BRANCH_EQ: return "BRANCH_EQ"; - case NODE_MODE::BRANCH_NEQ: return "BRANCH_NEQ"; - case NODE_MODE::LEAF: return "LEAF"; - default: return "?"; - } +const char* to_str(NODE_MODE mode) { + switch (mode) { + case NODE_MODE::BRANCH_LEQ: return "BRANCH_LEQ"; + case NODE_MODE::BRANCH_LT: return "BRANCH_LT"; + case NODE_MODE::BRANCH_GTE: return "BRANCH_GTE"; + case NODE_MODE::BRANCH_GT: return "BRANCH_GT"; + case NODE_MODE::BRANCH_EQ: return "BRANCH_EQ"; + case NODE_MODE::BRANCH_NEQ: return "BRANCH_NEQ"; + case NODE_MODE::LEAF: return "LEAF"; + default: return "?"; + } } -SVM_TYPE to_SVM_TYPE(const std::string &value) { - if (value.compare("SVM_LINEAR") == 0) return SVM_TYPE::SVM_LINEAR; - if (value.compare("SVM_SVC") == 0) return SVM_TYPE::SVM_SVC; - throw std::runtime_error(std::string("SVM_TYPE '") + - value + - std::string("' is not defined.")); +SVM_TYPE to_SVM_TYPE(const std::string& value) { + if (value.compare("SVM_LINEAR") == 0) return SVM_TYPE::SVM_LINEAR; + if (value.compare("SVM_SVC") == 0) return SVM_TYPE::SVM_SVC; + throw std::runtime_error(std::string("SVM_TYPE '") + + value + + std::string("' is not defined.")); } -KERNEL to_KERNEL(const std::string &value) { - if (value.compare("LINEAR") == 0) return KERNEL::LINEAR; - if (value.compare("POLY") == 0) return KERNEL::POLY; - if (value.compare("RBF") == 0) return KERNEL::RBF; - if (value.compare("SIGMOID") == 0) return KERNEL::SIGMOID; - throw std::runtime_error(std::string("KERNEL '") + - value + - std::string("' is not defined.")); +KERNEL to_KERNEL(const std::string& value) { + if (value.compare("LINEAR") == 0) return KERNEL::LINEAR; + if (value.compare("POLY") == 0) return KERNEL::POLY; + if (value.compare("RBF") == 0) return KERNEL::RBF; + if (value.compare("SIGMOID") == 0) return KERNEL::SIGMOID; + throw std::runtime_error(std::string("KERNEL '") + + value + + std::string("' is not defined.")); } AGGREGATE_FUNCTION to_AGGREGATE_FUNCTION(const std::string& input) { - if (input == "AVERAGE") return AGGREGATE_FUNCTION::AVERAGE; - if (input == "SUM") return AGGREGATE_FUNCTION::SUM; - if (input == "MIN") return AGGREGATE_FUNCTION::MIN; - if (input == "MAX") return AGGREGATE_FUNCTION::MAX; - throw std::runtime_error(std::string("AGGREGATE_FUNCTION '") + - input + - std::string("' is not defined.")); + if (input == "AVERAGE") return AGGREGATE_FUNCTION::AVERAGE; + if (input == "SUM") return AGGREGATE_FUNCTION::SUM; + if (input == "MIN") return AGGREGATE_FUNCTION::MIN; + if (input == "MAX") return AGGREGATE_FUNCTION::MAX; + throw std::runtime_error(std::string("AGGREGATE_FUNCTION '") + + input + + std::string("' is not defined.")); } StorageOrder to_StorageOrder(const std::string& input) { - if (input == "UNKNOWN") return StorageOrder::UNKNOWN; - if (input == "NHWC") return StorageOrder::NHWC; - if (input == "NCHW") return StorageOrder::NCHW; - throw std::runtime_error(std::string("StorageOrder '") + - input + - std::string("' is not defined.")); + if (input == "UNKNOWN") return StorageOrder::UNKNOWN; + if (input == "NHWC") return StorageOrder::NHWC; + if (input == "NCHW") return StorageOrder::NCHW; + throw 
std::runtime_error(std::string("StorageOrder '") + + input + + std::string("' is not defined.")); } -AutoPadType to_AutoPadType(const std::string &input) { - if (input == "NOTSET") return AutoPadType::NOTSET; - if (input == "VALID") return AutoPadType::VALID; - if (input == "SAME_UPPER") return AutoPadType::SAME_UPPER; - if (input == "SAME_LOWER") return AutoPadType::SAME_LOWER; - throw std::runtime_error(std::string("AutoPadType '") + - input + - std::string("' is not defined.")); +AutoPadType to_AutoPadType(const std::string& input) { + if (input == "NOTSET") return AutoPadType::NOTSET; + if (input == "VALID") return AutoPadType::VALID; + if (input == "SAME_UPPER") return AutoPadType::SAME_UPPER; + if (input == "SAME_LOWER") return AutoPadType::SAME_LOWER; + throw std::runtime_error(std::string("AutoPadType '") + + input + + std::string("' is not defined.")); } -void debug_print(const std::string &msg, int64_t iter, int64_t end) { - std::cout << msg.c_str() << ":" << iter << "/" << end << "\n"; +void debug_print(const std::string& msg, int64_t iter, int64_t end) { + std::cout << msg.c_str() << ":" << iter << "/" << end << "\n"; } -void debug_print(const std::string &msg, size_t i, size_t j, size_t k, float pa, float pb, float val) { - std::cout << msg.c_str() << ": (" << i << "," << j << "," << k << "): " << pa << "," << pb << " -> " << val << "\n"; +void debug_print(const std::string& msg, size_t i, size_t j, size_t k, float pa, float pb, float val) { + std::cout << msg.c_str() << ": (" << i << "," << j << "," << k << "): " << pa << "," << pb << " -> " << val << "\n"; } -void debug_print(const std::string &msg, size_t i, size_t j, size_t k, double pa, double pb, double val) { - std::cout << msg.c_str() << ": (" << i << "," << j << "," << k << "): " << pa << "," << pb << " -> " << val << "\n"; +void debug_print(const std::string& msg, size_t i, size_t j, size_t k, double pa, double pb, double val) { + std::cout << msg.c_str() << ": (" << i << "," << j << "," << k << "): " << pa << "," << pb << " -> " << val << "\n"; } template void debug_print_(const std::string& msg, T value) { - std::cout << msg.c_str() << ":" << value << "\n"; + std::cout << msg.c_str() << ":" << value << "\n"; } void debug_print(const std::string& msg, float value) { - debug_print_(msg.c_str(), value); + debug_print_(msg.c_str(), value); } void debug_print(const std::string& msg, double value) { - debug_print_(msg.c_str(), value); + debug_print_(msg.c_str(), value); } void debug_print(const std::string& msg, int64_t value) { - debug_print_(msg.c_str(), value); + debug_print_(msg.c_str(), value); } void debug_print(const std::string& msg, size_t value) { - debug_print_(msg.c_str(), value); + debug_print_(msg.c_str(), value); } - - - - diff --git a/mlprodict/onnxrt/ops_cpu/op_common_.hpp b/mlprodict/onnxrt/ops_cpu/op_common_.hpp index 869b0d6f0..972c8c758 100644 --- a/mlprodict/onnxrt/ops_cpu/op_common_.hpp +++ b/mlprodict/onnxrt/ops_cpu/op_common_.hpp @@ -1,5 +1,6 @@ #pragma once +#include #include #include #include @@ -22,10 +23,10 @@ inline bool _isnan_(double x) { return ::isnan(x); } // See https://stackoverflow.com/questions/2249110/how-do-i-make-a-portable-isnan-isinf-function inline bool _isnan_(double x) { - union { uint64_t u; double f; } ieee754; - ieee754.f = x; - return ( (unsigned)(ieee754.u >> 32) & 0x7fffffff ) + - ( (unsigned)ieee754.u != 0 ) > 0x7ff00000; + union { uint64_t u; double f; } ieee754; + ieee754.f = x; + return ((unsigned)(ieee754.u >> 32) & 0x7fffffff) + + ((unsigned)ieee754.u != 0) > 
0x7ff00000; } inline bool _isnan_(float x) { return _isnan_((double)x); } @@ -34,111 +35,111 @@ inline bool _isnan_(float x) { return _isnan_((double)x); } enum class POST_EVAL_TRANSFORM { - NONE, - LOGISTIC, - SOFTMAX, - SOFTMAX_ZERO, - PROBIT + NONE, + LOGISTIC, + SOFTMAX, + SOFTMAX_ZERO, + PROBIT }; -POST_EVAL_TRANSFORM to_POST_EVAL_TRANSFORM(const std::string &value); +POST_EVAL_TRANSFORM to_POST_EVAL_TRANSFORM(const std::string& value); enum class NODE_MODE { - BRANCH_LEQ, - BRANCH_LT, - BRANCH_GTE, - BRANCH_GT, - BRANCH_EQ, - BRANCH_NEQ, - LEAF + BRANCH_LEQ, + BRANCH_LT, + BRANCH_GTE, + BRANCH_GT, + BRANCH_EQ, + BRANCH_NEQ, + LEAF }; -NODE_MODE to_NODE_MODE(const std::string &value); +NODE_MODE to_NODE_MODE(const std::string& value); -const char * to_str(NODE_MODE mode); +const char* to_str(NODE_MODE mode); enum class AGGREGATE_FUNCTION { - AVERAGE, - SUM, - MIN, - MAX + AVERAGE, + SUM, + MIN, + MAX }; AGGREGATE_FUNCTION to_AGGREGATE_FUNCTION(const std::string& input); enum class SVM_TYPE { - SVM_LINEAR, - SVM_SVC + SVM_LINEAR, + SVM_SVC }; -SVM_TYPE to_SVM_TYPE(const std::string &value); +SVM_TYPE to_SVM_TYPE(const std::string& value); enum KERNEL { - LINEAR, - POLY, - RBF, - SIGMOID + LINEAR, + POLY, + RBF, + SIGMOID }; -KERNEL to_KERNEL(const std::string &value); +KERNEL to_KERNEL(const std::string& value); enum StorageOrder { - UNKNOWN = 0, - NHWC = 1, - NCHW = 2, + UNKNOWN = 0, + NHWC = 1, + NCHW = 2, }; -StorageOrder to_StorageOrder(const std::string &value); +StorageOrder to_StorageOrder(const std::string& value); enum class AutoPadType { - NOTSET = 0, - VALID = 1, - SAME_UPPER = 2, - SAME_LOWER = 3, + NOTSET = 0, + VALID = 1, + SAME_UPPER = 2, + SAME_LOWER = 3, }; -AutoPadType to_AutoPadType(const std::string &value); +AutoPadType to_AutoPadType(const std::string& value); static inline float ErfInv(float x) { - float sgn = x < 0 ? -1.0f : 1.0f; - x = (1 - x) * (1 + x); - float log = std::log(x); - float v = 2 / (3.14159f * 0.147f) + 0.5f * log; - float v2 = 1 / (0.147f) * log; - float v3 = -v + std::sqrt(v * v - v2); - x = sgn * std::sqrt(v3); - return x; + float sgn = x < 0 ? -1.0f : 1.0f; + x = (1 - x) * (1 + x); + float log = std::log(x); + float v = 2 / (3.14159f * 0.147f) + 0.5f * log; + float v2 = 1 / (0.147f) * log; + float v3 = -v + std::sqrt(v * v - v2); + x = sgn * std::sqrt(v3); + return x; } static inline double ErfInv(double x) { - double sgn = x < 0 ? -1.0 : 1.0; - x = (1 - x) * (1 + x); - double log = std::log(x); - double v = 2. / (3.14159f * 0.147f) + 0.5f * log; - double v2 = 1. / (0.147f) * log; - double v3 = std::sqrt(v * v - v2) - v; - return sgn * std::sqrt(v3); + double sgn = x < 0 ? -1.0 : 1.0; + x = (1 - x) * (1 + x); + double log = std::log(x); + double v = 2. / (3.14159f * 0.147f) + 0.5f * log; + double v2 = 1. / (0.147f) * log; + double v3 = std::sqrt(v * v - v2) - v; + return sgn * std::sqrt(v3); } static inline float ComputeLogistic(float val) { - float v = 1 / (1 + std::exp(-std::abs(val))); - return (val < 0) ? (1 - v) : v; + float v = 1 / (1 + std::exp(-std::abs(val))); + return (val < 0) ? (1 - v) : v; } static inline double ComputeLogistic(double val) { - double v = 1. / (1. + std::exp(-std::abs(val))); - return (val < 0) ? (1. - v) : v; + double v = 1. / (1. + std::exp(-std::abs(val))); + return (val < 0) ? (1. 
- v) : v; } @@ -147,180 +148,181 @@ static const float ml_sqrt2 = 1.41421356f; template static inline NTYPE ComputeProbit(NTYPE val) { - return ml_sqrt2 * ErfInv(val * 2 - 1); + return ml_sqrt2 * ErfInv(val * 2 - 1); } template static inline NTYPE sigmoid_probability(NTYPE score, NTYPE proba, NTYPE probb) { - NTYPE val = score * proba + probb; - return 1 - ComputeLogistic(val); // ref: https://github.com/arnaudsj/libsvm/blob/eaaefac5ebd32d0e07902e1ae740e038eaaf0826/svm.cpp#L1818 + NTYPE val = score * proba + probb; + return 1 - ComputeLogistic(val); // ref: https://github.com/arnaudsj/libsvm/blob/eaaefac5ebd32d0e07902e1ae740e038eaaf0826/svm.cpp#L1818 } template void ComputeSoftmax(NTYPE* begin, NTYPE* end) { - NTYPE v_max = -std::numeric_limits::max(); - NTYPE* it; - for (it = begin; it != end; ++it) { - if (*it > v_max) - v_max = *it; - } - NTYPE this_sum = (NTYPE)0.; - for (it = begin; it != end; ++it) { - *it = std::exp(*it - v_max); - this_sum += *it; - } - for (it = begin; it != end; ++it) - *it /= this_sum; + NTYPE v_max = -std::numeric_limits::max(); + NTYPE* it; + for (it = begin; it != end; ++it) { + if (*it > v_max) + v_max = *it; + } + NTYPE this_sum = (NTYPE)0.; + for (it = begin; it != end; ++it) { + *it = std::exp(*it - v_max); + this_sum += *it; + } + for (it = begin; it != end; ++it) + *it /= this_sum; } template void ComputeSoftmax(std::vector& values) { - ComputeSoftmax(values.data(), values.data() + values.size()); + ComputeSoftmax(values.data(), values.data() + values.size()); } template void ComputeSoftmaxZero(NTYPE* begin, NTYPE* end) { - NTYPE v_max = -std::numeric_limits::max(); - NTYPE* it; - for (it = begin; it != end; ++it) { - if (*it > v_max) - v_max = *it; - } - NTYPE exp_neg_v_max = std::exp(-v_max); - NTYPE this_sum = (NTYPE)0; - for (it = begin; it != end; ++it) { - if (*it > 0.0000001f || *it < -0.0000001f) { - *it = std::exp(*it - v_max); - this_sum += *it; - } else { - *it *= exp_neg_v_max; - } - } - for (it = begin; it != end; ++it) - *it /= this_sum; + NTYPE v_max = -std::numeric_limits::max(); + NTYPE* it; + for (it = begin; it != end; ++it) { + if (*it > v_max) + v_max = *it; + } + NTYPE exp_neg_v_max = std::exp(-v_max); + NTYPE this_sum = (NTYPE)0; + for (it = begin; it != end; ++it) { + if (*it > 0.0000001f || *it < -0.0000001f) { + *it = std::exp(*it - v_max); + this_sum += *it; + } + else { + *it *= exp_neg_v_max; + } + } + for (it = begin; it != end; ++it) + *it /= this_sum; } template void ComputeSoftmaxZero(std::vector& values) { - ComputeSoftmaxZero(values.data(), values.data() + values.size()); + ComputeSoftmaxZero(values.data(), values.data() + values.size()); } template size_t write_scores(std::vector& scores, POST_EVAL_TRANSFORM post_transform, - NTYPE* Z, int add_second_class) { - if ((scores.size() == 1) && add_second_class) { - scores.push_back(0); - return write_scores(1, scores.data(), post_transform, Z, add_second_class); - } - return write_scores(scores.size(), scores.data(), post_transform, Z, add_second_class); + NTYPE* Z, int add_second_class) { + if ((scores.size() == 1) && add_second_class) { + scores.push_back(0); + return write_scores(1, scores.data(), post_transform, Z, add_second_class); + } + return write_scores(scores.size(), scores.data(), post_transform, Z, add_second_class); } template size_t write_scores(size_t n_classes, NTYPE* scores, POST_EVAL_TRANSFORM post_transform, - NTYPE* Z, int add_second_class) { - if (n_classes >= 2) { - NTYPE * end = scores + n_classes; - switch (post_transform) { - case 
POST_EVAL_TRANSFORM::PROBIT: - for(auto it = scores; it != end; ++it, ++Z) - *Z = ComputeProbit(*it); - break; - case POST_EVAL_TRANSFORM::LOGISTIC: - for(auto it = scores; it != end; ++it, ++Z) - *Z = ComputeLogistic(*it); - break; - case POST_EVAL_TRANSFORM::SOFTMAX: - ComputeSoftmax(scores, end); - memcpy(Z, scores, n_classes * sizeof(NTYPE)); - break; - case POST_EVAL_TRANSFORM::SOFTMAX_ZERO: - ComputeSoftmaxZero(scores, end); - memcpy(Z, scores, n_classes * sizeof(NTYPE)); - break; - default: - case POST_EVAL_TRANSFORM::NONE: - memcpy(Z, scores, n_classes * sizeof(NTYPE)); - break; - } - } - else if (n_classes == 1) { //binary case - if (post_transform == POST_EVAL_TRANSFORM::PROBIT) { - scores[0] = ComputeProbit(scores[0]); - *Z = scores[0]; - } - else { - switch (add_second_class) { - case 0: //0=all positive weights, winning class is positive - scores[1] = scores[0]; - scores[0] = 1.f - scores[0]; //put opposite score in positive slot - *Z = scores[0]; - *(Z+1) = scores[1]; - ++n_classes; - break; - case 1: //1 = all positive weights, winning class is negative - scores[1] = scores[0]; - scores[0] = 1.f - scores[0]; //put opposite score in positive slot - *Z = scores[0]; - *(Z+1) = scores[1]; - ++n_classes; - break; - case 2: - case 3: //2 = mixed weights, winning class is positive - if (post_transform == POST_EVAL_TRANSFORM::LOGISTIC) { - scores[1] = ComputeLogistic(scores[0]); //ml_logit(scores[k]); - scores[0] = ComputeLogistic(-scores[0]); - } - else { - scores[1] = scores[0]; - scores[0] = -scores[0]; - } - *Z = scores[0]; - *(Z+1) = scores[1]; - ++n_classes; - break; - default: - *Z = scores[0]; - break; - } - } - } - return n_classes; + NTYPE* Z, int add_second_class) { + if (n_classes >= 2) { + NTYPE* end = scores + n_classes; + switch (post_transform) { + case POST_EVAL_TRANSFORM::PROBIT: + for (auto it = scores; it != end; ++it, ++Z) + *Z = ComputeProbit(*it); + break; + case POST_EVAL_TRANSFORM::LOGISTIC: + for (auto it = scores; it != end; ++it, ++Z) + *Z = ComputeLogistic(*it); + break; + case POST_EVAL_TRANSFORM::SOFTMAX: + ComputeSoftmax(scores, end); + memcpy(Z, scores, n_classes * sizeof(NTYPE)); + break; + case POST_EVAL_TRANSFORM::SOFTMAX_ZERO: + ComputeSoftmaxZero(scores, end); + memcpy(Z, scores, n_classes * sizeof(NTYPE)); + break; + default: + case POST_EVAL_TRANSFORM::NONE: + memcpy(Z, scores, n_classes * sizeof(NTYPE)); + break; + } + } + else if (n_classes == 1) { //binary case + if (post_transform == POST_EVAL_TRANSFORM::PROBIT) { + scores[0] = ComputeProbit(scores[0]); + *Z = scores[0]; + } + else { + switch (add_second_class) { + case 0: //0=all positive weights, winning class is positive + scores[1] = scores[0]; + scores[0] = 1.f - scores[0]; //put opposite score in positive slot + *Z = scores[0]; + *(Z + 1) = scores[1]; + ++n_classes; + break; + case 1: //1 = all positive weights, winning class is negative + scores[1] = scores[0]; + scores[0] = 1.f - scores[0]; //put opposite score in positive slot + *Z = scores[0]; + *(Z + 1) = scores[1]; + ++n_classes; + break; + case 2: + case 3: //2 = mixed weights, winning class is positive + if (post_transform == POST_EVAL_TRANSFORM::LOGISTIC) { + scores[1] = ComputeLogistic(scores[0]); //ml_logit(scores[k]); + scores[0] = ComputeLogistic(-scores[0]); + } + else { + scores[1] = scores[0]; + scores[0] = -scores[0]; + } + *Z = scores[0]; + *(Z + 1) = scores[1]; + ++n_classes; + break; + default: + *Z = scores[0]; + break; + } + } + } + return n_classes; } template size_t write_scores2(NTYPE* scores, 
POST_EVAL_TRANSFORM post_transform, - NTYPE* Z, int add_second_class) { - switch (post_transform) { - case POST_EVAL_TRANSFORM::PROBIT: - Z[0] = ComputeProbit(scores[0]); - Z[1] = ComputeProbit(scores[1]); - break; - case POST_EVAL_TRANSFORM::LOGISTIC: - Z[0] = ComputeLogistic(scores[0]); - Z[1] = ComputeLogistic(scores[1]); - break; - case POST_EVAL_TRANSFORM::SOFTMAX: - ComputeSoftmax(scores, scores + 2); - memcpy(Z, scores, 2 * sizeof(NTYPE)); - break; - case POST_EVAL_TRANSFORM::SOFTMAX_ZERO: - ComputeSoftmaxZero(scores, scores + 2); - memcpy(Z, scores, 2 * sizeof(NTYPE)); - break; - default: - case POST_EVAL_TRANSFORM::NONE: - memcpy(Z, scores, 2 * sizeof(NTYPE)); - break; - } - return 2; + NTYPE* Z, int add_second_class) { + switch (post_transform) { + case POST_EVAL_TRANSFORM::PROBIT: + Z[0] = ComputeProbit(scores[0]); + Z[1] = ComputeProbit(scores[1]); + break; + case POST_EVAL_TRANSFORM::LOGISTIC: + Z[0] = ComputeLogistic(scores[0]); + Z[1] = ComputeLogistic(scores[1]); + break; + case POST_EVAL_TRANSFORM::SOFTMAX: + ComputeSoftmax(scores, scores + 2); + memcpy(Z, scores, 2 * sizeof(NTYPE)); + break; + case POST_EVAL_TRANSFORM::SOFTMAX_ZERO: + ComputeSoftmaxZero(scores, scores + 2); + memcpy(Z, scores, 2 * sizeof(NTYPE)); + break; + default: + case POST_EVAL_TRANSFORM::NONE: + memcpy(Z, scores, 2 * sizeof(NTYPE)); + break; + } + return 2; } @@ -344,74 +346,74 @@ size_t write_scores2(NTYPE* scores, POST_EVAL_TRANSFORM post_transform, template NTYPE flattened_dimension(const std::vector& values) { - NTYPE r = 1; - for(auto it = values.begin(); it != values.end(); ++it) - r *= *it; - return r; + NTYPE r = 1; + for (auto it = values.begin(); it != values.end(); ++it) + r *= *it; + return r; } template NTYPE flattened_dimension(const std::vector& values, int64_t first) { - NTYPE r = 1; - auto end = values.begin() + first; - for(auto it = values.begin(); it != end; ++it) - r *= *it; - return r; + NTYPE r = 1; + auto end = values.begin() + first; + for (auto it = values.begin(); it != end; ++it) + r *= *it; + return r; } template -void shape2strides(const std::vector& shape, - std::vector& strides, NTYPE cst) { - strides.resize(shape.size()); - strides[strides.size()-1] = sizeof(NTYPE); - for(ssize_t i = strides.size()-2; i >= 0; --i) - strides[i] = strides[i+1] * shape[i+1]; +void shape2strides(const std::vector& shape, + std::vector& strides, NTYPE cst) { + strides.resize(shape.size()); + strides[strides.size() - 1] = sizeof(NTYPE); + for (int64_t i = (int64_t)strides.size() - 2; i >= 0; --i) + strides[i] = strides[i + 1] * shape[i + 1]; } template DIMTYPE SizeFromDimension(const std::vector& shape, size_t start, size_t end) { - DIMTYPE size = 1; - for (size_t i = start; i < end; i++) { - if (shape[i] < 0) - return -1; - size *= shape[i]; - } - return size; + DIMTYPE size = 1; + for (size_t i = start; i < end; i++) { + if (shape[i] < 0) + return -1; + size *= shape[i]; + } + return size; } template constexpr T roundUpPow2(T a) { - return (a + (b - 1)) & (~(b - 1)); + return (a + (b - 1)) & (~(b - 1)); } inline int64_t HandleNegativeAxis(int64_t axis, int64_t tensor_rank) { - return axis < 0 ? axis + tensor_rank : axis; + return axis < 0 ? axis + tensor_rank : axis; } template void debug_print(const std::string& msg, size_t size, const T* value) { - std::cout << msg << " - size:" << size << " :: "; - size_t i = size > 10 ? 
10 : size; - for (size_t j = 0; j < i; ++j) - std::cout << value[j] << " "; - std::cout << "\n"; + std::cout << msg << " - size:" << size << " :: "; + size_t i = size > 10 ? 10 : size; + for (size_t j = 0; j < i; ++j) + std::cout << value[j] << " "; + std::cout << "\n"; } template void debug_print(const std::string& msg, const std::vector& value) { - auto size = value.size(); - std::cout << msg << " - size:" << size << " :: "; - size_t i = size > 10 ? 10 : size; - for (size_t j = 0; j < i; ++j) - std::cout << value[j] << " "; - std::cout << "\n"; + auto size = value.size(); + std::cout << msg << " - size:" << size << " :: "; + size_t i = size > 10 ? 10 : size; + for (size_t j = 0; j < i; ++j) + std::cout << value[j] << " "; + std::cout << "\n"; } @@ -426,18 +428,18 @@ void debug_print(const std::string& msg, size_t i, size_t j, size_t k, double pa template inline void MakeStringInternal(std::ostringstream& ss, const T& t) noexcept { - ss << t; + ss << t; } template inline void MakeStringInternal(std::ostringstream& ss, const T& t, const Args&... args) noexcept { - MakeStringInternal(ss, t); - MakeStringInternal(ss, args...); + MakeStringInternal(ss, t); + MakeStringInternal(ss, args...); } template inline std::string MakeString(const Args&... args) { - std::ostringstream ss; - MakeStringInternal(ss, args...); - return std::string(ss.str()); + std::ostringstream ss; + MakeStringInternal(ss, args...); + return std::string(ss.str()); } diff --git a/mlprodict/onnxrt/ops_cpu/op_conv_.cpp b/mlprodict/onnxrt/ops_cpu/op_conv_.cpp index 170c235d1..e68a455bb 100644 --- a/mlprodict/onnxrt/ops_cpu/op_conv_.cpp +++ b/mlprodict/onnxrt/ops_cpu/op_conv_.cpp @@ -23,35 +23,17 @@ namespace py = pybind11; template -class Conv : ConvPoolCommon { - - private: - - AutoPadType auto_pad_; - std::vector dilations_; - int64_t group_; - std::vector kernel_shape_; - std::vector pads_; - std::vector strides_; +class Conv : public ConvPoolCommon { public: Conv(); - void init(const std::string &auto_pad, - py::array_t dilations, - int64_t group, - py::array_t kernel_shape, - py::array_t pads, - py::array_t strides); py::array_t compute(py::array_t X, py::array_t W, py::array_t B) const; - private: - - void compute_kernel_shape(const std::vector& weight_shape, - std::vector& kernel_shape) const; + protected: void compute_gil_free(py::array_t X, py::array_t W, py::array_t B, py::array_t& Y, @@ -64,57 +46,10 @@ class Conv : ConvPoolCommon { const std::vector& x_dims, const std::vector& y_dims, const std::vector& w_dims) const; - - void infer_output_shape(const std::vector& input_shape, - const std::vector& kernel_shape, - const std::vector& strides_p, - const std::vector& dilations_p, - std::vector& pads_p, - std::vector& output_shape, - bool ForceSymmetricAutoPadding = false) const; }; template -Conv::Conv() : ConvPoolCommon() { -} - - -template -void Conv::init( - const std::string &auto_pad, - py::array_t dilations, - int64_t group, - py::array_t kernel_shape, - py::array_t pads, - py::array_t strides - ) { - auto_pad_ = to_AutoPadType(auto_pad); - array2vector(dilations_, dilations, int64_t); - group_ = group; - array2vector(dilations_, dilations, int64_t); - array2vector(pads_, pads, int64_t); - array2vector(strides_, strides, int64_t); -} - - -template -void Conv::compute_kernel_shape(const std::vector& weight_shape, - std::vector& kernel_shape) const { - if (kernel_shape_.size() > 0) { - kernel_shape = kernel_shape_; - if (kernel_shape.size() + 2 != weight_shape.size()) - throw std::runtime_error( - "kernel_shape 
num_dims is not compatible with W num_dims (1)."); - - for (size_t i = 0; i < kernel_shape.size(); ++i) - if (kernel_shape[i] != weight_shape[i + 2]) - throw std::runtime_error( - "kernel_shape num_dims is not compatible with W num_dims (2)."); - } - else { - auto& weight_dims = weight_shape; - kernel_shape = std::vector(weight_dims.begin() + 2, weight_dims.end()); - } +Conv::Conv() : ConvPoolCommon() { } @@ -150,7 +85,7 @@ py::array_t Conv::compute(py::array_t y_dims; y_dims.insert(y_dims.begin(), {N, M}); std::vector input_shape(x_dims.begin() + 2, x_dims.end()); - infer_output_shape(input_shape, kernel_shape, strides, dilations, pads, y_dims); + infer_output_shape(input_shape, kernel_shape, strides, dilations, pads, y_dims, false); std::vector output_shape(y_dims.begin() + 2, y_dims.end()); // py::array::ShapeContainer shape(y_dims); @@ -167,38 +102,6 @@ py::array_t Conv::compute(py::array_t -void Conv::infer_output_shape( - const std::vector& input_shape, - const std::vector& kernel_shape, - const std::vector& strides_p, - const std::vector& dilations_p, - std::vector& pads_p, - std::vector& output_shape, - bool ForceSymmetricAutoPadding) const { - - size_t rank = input_shape.size(); - int64_t dim_size; - - for (size_t dim = 0; dim < rank; ++dim) { - if (dim >= strides_p.size() || dim >= kernel_shape.size() || - dim >= dilations_p.size() || dim >= pads_p.size() || - rank + dim >= pads_p.size()) - throw std::runtime_error("Failure in infer_output_shape."); - - dim_size = 0; - ComputePadAndOutputShape( - input_shape[dim], strides_p[dim], kernel_shape[dim], - dilations_p[dim], auto_pad_, &pads_p.at(dim), - &pads_p.at(input_shape.size() + dim), - &dim_size, ForceSymmetricAutoPadding); - if (dim_size <= 0) - throw std::runtime_error("Invalid argument in infer_output_shape."); - output_shape.push_back(dim_size); - } -} - - template void Conv::compute_gil_free( py::array_t X, py::array_t W, py::array_t B, py::array_t& Y, diff --git a/mlprodict/onnxrt/ops_cpu/op_conv_matrices_.cpp b/mlprodict/onnxrt/ops_cpu/op_conv_matrices_.cpp new file mode 100644 index 000000000..a21168c94 --- /dev/null +++ b/mlprodict/onnxrt/ops_cpu/op_conv_matrices_.cpp @@ -0,0 +1,120 @@ +#include "op_conv_matrices_.hpp" + + +void ComputePadAndOutputShape( + int64_t in_dim, int64_t stride, + int64_t kernel, int64_t dilation, + AutoPadType pad_type, int64_t* pad_head, + int64_t* pad_tail, int64_t* out_dim, + bool ForceSymmetricAutoPadding) { + + const int64_t dkernel = dilation * (kernel - 1) + 1; + + if (pad_type == AutoPadType::NOTSET) { + *out_dim = static_cast(static_cast( + in_dim + *pad_head + *pad_tail - dkernel) / stride + 1); + } + else { + switch (pad_type) { + case AutoPadType::VALID: + *pad_head = 0; + *pad_tail = 0; + *out_dim = (in_dim - dkernel) / stride + 1; + break; + case AutoPadType::SAME_UPPER: + case AutoPadType::SAME_LOWER: { + if (dilation != 1) + throw std::runtime_error( + "Dilation not supported for AutoPadType::SAME_UPPER or AutoPadType::SAME_LOWER."); + int64_t legacy_target_size = (in_dim + stride - 1) / stride; + int64_t pad_needed = (legacy_target_size - 1) * stride + kernel - in_dim; + *out_dim = (in_dim + pad_needed - dkernel) / stride + 1; + + // make sure padding is symmetric + if (ForceSymmetricAutoPadding) + pad_needed = roundUpPow2(pad_needed); + + *pad_head = (pad_type == AutoPadType::SAME_LOWER) + ? 
(pad_needed + 1) / 2 + : pad_needed / 2; + *pad_tail = pad_needed - *pad_head; + } break; + default: + throw std::runtime_error("Invalid argument in ComputePadAndOutputShape."); + } + } +} + + +void ConvPoolCommonShape::init( + const std::string& auto_pad, + py_array_t kernel_shape) { + auto_pad_ = to_AutoPadType(auto_pad); + array2vector(kernel_shape_, kernel_shape, int64_t); +} + + +void ConvPoolCommonShape::compute_kernel_shape( + const std::vector& weight_shape, + std::vector& kernel_shape) const { + if (kernel_shape_.size() > 0) { + kernel_shape = kernel_shape_; + if (kernel_shape.size() + 2 != weight_shape.size()) + throw std::runtime_error( + "kernel_shape num_dims is not compatible with W num_dims (1)."); + + for (size_t i = 0; i < kernel_shape.size(); ++i) + if (kernel_shape[i] != weight_shape[i + 2]) + throw std::runtime_error( + "kernel_shape num_dims is not compatible with W num_dims (2)."); + } + else { + auto& weight_dims = weight_shape; + kernel_shape = std::vector(weight_dims.begin() + 2, weight_dims.end()); + } +} + + +void ConvPoolCommonShape::infer_output_shape(const std::vector& input_shape, + const std::vector& kernel_shape, + const std::vector& strides_p, + const std::vector& dilations_p, + std::vector& pads_p, + std::vector& output_shape, + bool ForceSymmetricAutoPadding) const { + + size_t rank = input_shape.size(); + int64_t dim_size; + + for (size_t dim = 0; dim < rank; ++dim) { + if (dim >= strides_p.size() || dim >= kernel_shape.size() || + dim >= dilations_p.size() || dim >= pads_p.size() || + rank + dim >= pads_p.size()) + throw std::runtime_error("Failure in infer_output_shape."); + + dim_size = 0; + ComputePadAndOutputShape( + input_shape[dim], strides_p[dim], kernel_shape[dim], + dilations_p[dim], auto_pad_, &pads_p.at(dim), + &pads_p.at(input_shape.size() + dim), + &dim_size, ForceSymmetricAutoPadding); + if (dim_size <= 0) + throw std::runtime_error("Invalid argument in infer_output_shape."); + output_shape.push_back(dim_size); + } +} + + +void ConvPoolCommon::init( + const std::string& auto_pad, + py_array_t dilations, + int64_t group, + py_array_t kernel_shape, + py_array_t pads, + py_array_t strides) { + ConvPoolCommonShape::init(auto_pad, kernel_shape); + array2vector(dilations_, dilations, int64_t); + group_ = group; + array2vector(pads_, pads, int64_t); + array2vector(strides_, strides, int64_t); +} diff --git a/mlprodict/onnxrt/ops_cpu/op_conv_matrices_.hpp b/mlprodict/onnxrt/ops_cpu/op_conv_matrices_.hpp index 8afbb8cd7..92d1bb935 100644 --- a/mlprodict/onnxrt/ops_cpu/op_conv_matrices_.hpp +++ b/mlprodict/onnxrt/ops_cpu/op_conv_matrices_.hpp @@ -7,372 +7,802 @@ #define _CRT_SECURE_NO_WARNINGS #endif +#include "op_common_.hpp" +#define is_a_ge_zero_and_a_lt_b(a, b) (static_cast(a) < static_cast(b)) + + #ifndef SKIP_PYTHON +//#include +#include +#include +#include +//#include + #if USE_OPENMP #include #endif +namespace py = pybind11; + +#define py_array_t py::array_t +#define py_array_style py::array::c_style | py::array::forcecast +#define py_gil_scoped_release py::gil_scoped_release + +#else + +#include +#include +#include +#include +#include // cout +#include +#include + +#define py_array_style int +#define py_gil_scoped_release int + +template +bool cmp_vector(const std::vector& v1, const std::vector& v2) { + if (v1.size() != v2.size()) + return false; + for (size_t i = 0; i < v1.size(); ++i) + if (v1[i] != v2[i]) + return false; + return true; +} + +template +class py_array_t : public std::vector { + std::vector shape_; +public: + py_array_t() 
: std::vector(), shape_() {} + py_array_t(const std::vector& shape) : std::vector(), shape_(shape) { + int64_t s = 1; + for (auto it : shape) + s *= it; + this->resize(s); + } + py_array_t(const std::vector& values, const std::vector& shape) : + std::vector(values), shape_(shape) { + if (shape.empty() && !!values.empty()) { + shape_.resize(1); + shape_[0] = values.size(); + } + } + const T* data(int64_t p = 0) const { return &((*this)[p]); } + int64_t ndim() const { return shape_.size(); } + int64_t shape(size_t i) const { return shape_[i]; } + bool equal(const py_array_t& value) { + return cmp_vector(shape_, value.shape_) && cmp_vector(*this, value); + } + bool operator == (T* ptr) { + if (ptr == nullptr) + return this->empty(); + throw std::runtime_error("not implemented when ptr != nullptr"); + } +}; + #endif -#include "op_common_.hpp" +template +void TensorTranspose(const T* input, T* output, size_t M, size_t N) { + const T* ptr; + for (size_t i = 0; i < M; ++i) { + ptr = input + i * N; + for (size_t j = 0; j < N; ++j) + output[j * M + i] = ptr[j]; + } +} -#define is_a_ge_zero_and_a_lt_b(a, b) (static_cast(a) < static_cast(b)) + +template +void QConvDepthwise(const T** Input, TI InputZeroPoint, const T* Filter, + TI FilterZeroPoint, bool FilterIsSigned, TI* Output, + size_t Channels, size_t OutputCount, size_t KernelSize) { + // Signed version. + while (OutputCount > 0) { + + size_t ChannelOffset = 0; + size_t c = Channels; + + while (c > 0) { + int32_t Accumulator = 0; + size_t ChannelKernelOffset = ChannelOffset; + + for (size_t k = 0; k < KernelSize; ++k) { + int32_t InputValue = int32_t(Input[k][ChannelOffset]) - InputZeroPoint; + int32_t FilterValue = int32_t(Filter[ChannelKernelOffset]) - FilterZeroPoint; + Accumulator += InputValue * FilterValue; + ChannelKernelOffset += Channels; + } + + *Output++ = Accumulator; + ++ChannelOffset; + --c; + } + + Input += KernelSize; + --OutputCount; + } +} + + +// The function adds value to C, assuming this array +// was initialized. +template +void gemm(bool transA, bool transB, + size_t M, size_t N, size_t K, NTYPE alpha, + const NTYPE* A, const NTYPE* B, NTYPE beta, + NTYPE* C) { + + if (transA) { + if (transB) { + } + else { + // a A B + b C, dimension = M * N + NTYPE* begin; + NTYPE val; + NTYPE val0; + size_t i, j, k, maxc = 0; + const NTYPE* pA, * pB; + for (i = 0, begin = C; i < M; ++i) { + for (j = 0; j < N; ++j, ++begin) { + val0 = *begin * beta; + val = 0; + pA = A + i; + pB = B + j; + for (k = K; k > 0; --k, pA += K, pB += N) + val += *pA * *pB; + *begin = val0 + val * alpha; + maxc = maxc > (size_t)(begin - C) ? maxc : (size_t)(begin - C); + if (maxc > M * N) + throw std::runtime_error("gemm10: maxc > M * N"); + } + } + return; + } + } + else { + if (transB) { + } + else { + // a A B + b C, dimension = M * N + NTYPE* begin; + NTYPE val; + NTYPE val0; + size_t i, j, k, maxc = 0; + const NTYPE* pA, * pB; + for (i = 0, begin = C; i < M; ++i) { + for (j = 0; j < N; ++j, ++begin) { + val0 = *begin * beta; + val = 0; + pA = A + i * K; + pB = B + j; + for (k = K; k > 0; --k, ++pA, pB += N) + val += *pA * *pB; + *begin = val0 + val * alpha; + maxc = maxc > (size_t)(begin - C) ? 
maxc : (size_t)(begin - C); + if (maxc > M * N) + throw std::runtime_error("gemm00: maxc > M * N"); + } + } + return; + } + } + throw std::runtime_error("Not implemented for transposed matrices (Gemm)."); +} + + +// NTYPE is uint8_t or int8_t +template +void QGemm(bool transA, bool transB, + size_t M, size_t N, size_t K, NTYPE alpha, + const NTYPE* A, const NTYPE* B, NTYPE beta, + int32_t* C, size_t lda, size_t ldb, size_t ldc, + NTYPE ZeroPointA = 0, + const NTYPE* ZeroPointB = nullptr, + bool BIsPacked = false, + bool PerColumnZeroPoints = false) { + + /* + struct GemmShapeParams { + size_t M = 0; + size_t N = 0; + size_t K = 0; + bool BIsSigned = false}; + struct GemmDataParams { + const uint8_t* A = nullptr; + size_t lda = 0; + uint8_t ZeroPointA = 0; + const void* B = 0; + size_t ldb = 0; + const uint8_t* ZeroPointB = nullptr; + bool BIsPacked = false; + bool PerColumnZeroPoints = false; + int32_t* C = nullptr; + size_t ldc = 0}; + */ + if (alpha != 1) + throw std::runtime_error("Not implemented for alpha != 1 (QGemm)."); + if (beta != 1) + throw std::runtime_error("Not implemented for beta != 1 (QGemm)."); + if (transA) { + if (transB) { + } + else { + } + } + else { + if (transB) { + } + else { + // a A B + b C, dimension = M * N + int32_t* begin; + NTYPE val; + NTYPE val0; + size_t i, j, k, maxc = 0; + const NTYPE* pA, * pB; + for (i = 0, begin = C; i < M; ++i) { + for (j = 0; j < N; ++j, ++begin) { + val0 = *begin; /* * beta;*/ + val = 0; + pA = A + i * K; + pB = B + j; + for (k = K; k > 0; --k, ++pA, pB += N) + val += *pA * *pB; + *begin = val0 + val; /* * alpha;*/ + maxc = maxc > (size_t)(begin - C) ? maxc : (size_t)(begin - C); + if (maxc > M * N) + throw std::runtime_error("qgemm00: maxc > M * N"); + } + } + return; + } + } + throw std::runtime_error("Not implemented for transposed matrices (QGemm)."); +} template static void Im2colWithEqualPadding( - int64_t output_h, int64_t output_w, const T* data_im, int64_t channels, - int64_t height, int64_t width, int64_t kernel_h, int64_t kernel_w, - int64_t dilation_h, int64_t dilation_w, int64_t pad_t, int64_t pad_l, - int64_t stride_h, int64_t stride_w, T* data_col, T padding_value) { - // From Intel, https://github.com/BVLC/caffe/pull/3536 - int64_t pad_h = pad_t; - int64_t pad_w = pad_l; - int64_t channel_size = height * width; - for (int64_t channel = channels; channel--; data_im += channel_size) { - for (int64_t kernel_row = 0; kernel_row < kernel_h; kernel_row++) { - for (int64_t kernel_col = 0; kernel_col < kernel_w; kernel_col++) { - int64_t input_row = -pad_h + kernel_row * dilation_h; - for (int64_t output_rows = output_h; output_rows; output_rows--) { - if (!is_a_ge_zero_and_a_lt_b(input_row, height)) { - std::fill_n(data_col, output_w, padding_value); - data_col += output_w; - } - else { - int64_t input_col = -pad_w + kernel_col * dilation_w; - const T* rdptr = data_im + input_row * width + input_col; - for (int64_t i = 0; i != output_w; ++i) { - *data_col = is_a_ge_zero_and_a_lt_b(input_col, width) - ? 
rdptr[i * stride_w] - : padding_value; - input_col += stride_w; - ++data_col; - } - } - input_row += stride_h; - } - } - } - } + int64_t output_h, int64_t output_w, const T* data_im, int64_t channels, + int64_t height, int64_t width, int64_t kernel_h, int64_t kernel_w, + int64_t dilation_h, int64_t dilation_w, int64_t pad_t, int64_t pad_l, + int64_t stride_h, int64_t stride_w, T* data_col, T padding_value) { + // From Intel, https://github.com/BVLC/caffe/pull/3536 + int64_t pad_h = pad_t; + int64_t pad_w = pad_l; + int64_t channel_size = height * width; + for (int64_t channel = channels; channel--; data_im += channel_size) { + for (int64_t kernel_row = 0; kernel_row < kernel_h; kernel_row++) { + for (int64_t kernel_col = 0; kernel_col < kernel_w; kernel_col++) { + int64_t input_row = -pad_h + kernel_row * dilation_h; + for (int64_t output_rows = output_h; output_rows; output_rows--) { + if (!is_a_ge_zero_and_a_lt_b(input_row, height)) { + std::fill_n(data_col, output_w, padding_value); + data_col += output_w; + } + else { + int64_t input_col = -pad_w + kernel_col * dilation_w; + const T* rdptr = data_im + input_row * width + input_col; + for (int64_t i = 0; i != output_w; ++i) { + *data_col = is_a_ge_zero_and_a_lt_b(input_col, width) + ? rdptr[i * stride_w] + : padding_value; + input_col += stride_w; + ++data_col; + } + } + input_row += stride_h; + } + } + } + } } template void Im2colNd_NCHW( - const T* data_img, const int64_t* im_shape, - const int64_t* col_shape, int64_t /*img_size*/, - int64_t /*col_size*/, const int64_t* kernel_shape, - const int64_t* stride, const int64_t* dilation, - const int64_t* pad, int64_t N, T* data_col, - bool accumulate_output = false, - T padding_value = 0) { - int64_t kernel_size = 1; - for (int64_t i = 0; i < N; ++i) - kernel_size *= kernel_shape[i]; - - int64_t channels_col = col_shape[0]; - std::vector d_offset(N, 0); - std::vector d_iter(N, 0); - - for (int64_t c_col = 0; c_col < channels_col; ++c_col) { - // Loop over spatial axes in reverse order to compute a per-axis offset. - int64_t offset = c_col; - for (int64_t d_i = N - 1; d_i >= 0; --d_i) { - if (d_i < N - 1) - offset /= kernel_shape[d_i + 1]; - d_offset[d_i] = offset % kernel_shape[d_i]; - } - for (bool incremented = true; incremented;) { - // Loop over spatial axes in forward order to compute the indices in the - // image and column, and whether the index lies in the padding. - int64_t index_col = c_col; - int64_t index_im = c_col / kernel_size; - bool is_padding = false; - for (int64_t d_i = 0; d_i < N; ++d_i) { - int64_t d = d_iter[d_i]; - int64_t d_im = d * stride[d_i] - pad[d_i] + d_offset[d_i] * dilation[d_i]; - is_padding |= d_im < 0 || d_im >= im_shape[d_i + 1]; - index_col *= col_shape[d_i + 1]; - index_col += d; - index_im *= im_shape[d_i + 1]; - index_im += d_im; - } - if (!accumulate_output) { - if (is_padding) - data_col[index_col] = padding_value; - else - data_col[index_col] = data_img[index_im]; - } - else if (!is_padding) // col2im - data_col[index_im] += data_img[index_col]; - - // Loop over spatial axes in reverse order to choose an index, - // like counting. 
- incremented = false; - for (int64_t d_i = N - 1; d_i >= 0; --d_i) { - int64_t d_max = col_shape[d_i + 1]; - // ORT_ENFORCE(d_iter[d_i] < d_max); - if (d_iter[d_i] == d_max - 1) - d_iter[d_i] = 0; - else { // d_iter[d_i] < d_max - 1 - ++d_iter[d_i]; - incremented = true; - break; - } - } - } // while(incremented) { - } // for (int c = 0; c < channels_col; ++c) { + const T* data_img, const int64_t* im_shape, + const int64_t* col_shape, int64_t /*img_size*/, + int64_t /*col_size*/, const int64_t* kernel_shape, + const int64_t* stride, const int64_t* dilation, + const int64_t* pad, int64_t N, T* data_col, + bool accumulate_output = false, + T padding_value = 0) { + int64_t kernel_size = 1; + for (int64_t i = 0; i < N; ++i) + kernel_size *= kernel_shape[i]; + + int64_t channels_col = col_shape[0]; + std::vector d_offset(N, 0); + std::vector d_iter(N, 0); + + for (int64_t c_col = 0; c_col < channels_col; ++c_col) { + // Loop over spatial axes in reverse order to compute a per-axis offset. + int64_t offset = c_col; + for (int64_t d_i = N - 1; d_i >= 0; --d_i) { + if (d_i < N - 1) + offset /= kernel_shape[d_i + 1]; + d_offset[d_i] = offset % kernel_shape[d_i]; + } + for (bool incremented = true; incremented;) { + // Loop over spatial axes in forward order to compute the indices in the + // image and column, and whether the index lies in the padding. + int64_t index_col = c_col; + int64_t index_im = c_col / kernel_size; + bool is_padding = false; + for (int64_t d_i = 0; d_i < N; ++d_i) { + int64_t d = d_iter[d_i]; + int64_t d_im = d * stride[d_i] - pad[d_i] + d_offset[d_i] * dilation[d_i]; + is_padding |= d_im < 0 || d_im >= im_shape[d_i + 1]; + index_col *= col_shape[d_i + 1]; + index_col += d; + index_im *= im_shape[d_i + 1]; + index_im += d_im; + } + if (!accumulate_output) { + if (is_padding) + data_col[index_col] = padding_value; + else + data_col[index_col] = data_img[index_im]; + } + else if (!is_padding) // col2im + data_col[index_im] += data_img[index_col]; + + // Loop over spatial axes in reverse order to choose an index, + // like counting. 
+ incremented = false; + for (int64_t d_i = N - 1; d_i >= 0; --d_i) { + int64_t d_max = col_shape[d_i + 1]; + // ORT_ENFORCE(d_iter[d_i] < d_max); + if (d_iter[d_i] == d_max - 1) + d_iter[d_i] = 0; + else { // d_iter[d_i] < d_max - 1 + ++d_iter[d_i]; + incremented = true; + break; + } + } + } // while(incremented) { + } // for (int c = 0; c < channels_col; ++c) { } template void Im2col_NCHW( - const T* data_im, int64_t channels, - int64_t height, int64_t width, - int64_t kernel_h, int64_t kernel_w, - int64_t dilation_h, int64_t dilation_w, - int64_t pad_t, int64_t pad_l, int64_t pad_b, int64_t pad_r, - int64_t stride_h, int64_t stride_w, T* data_col, - T padding_value = 0) { - const int64_t output_h = - (height + pad_b + pad_t - (dilation_h * (kernel_h - 1) + 1)) - / stride_h + 1; - const int64_t output_w = - (width + pad_l + pad_r - (dilation_w * (kernel_w - 1) + 1)) - / stride_w + 1; - - // Fast path for zero padding and no dilation - // From Torch, THNN_(unfolded_copy) - if (dilation_h == 1 && dilation_w == 1 && pad_l == 0 && pad_r == 0 && - pad_t == 0 && pad_b == 0) { - for (auto k = 0; k < channels * kernel_h * kernel_w; k++) { - const auto nip = k / (kernel_h * kernel_w); - const auto rest = k % (kernel_h * kernel_w); - const auto kh = rest / kernel_w; - const auto kw = rest % kernel_w; - auto* dst = data_col + nip * (kernel_h * kernel_w * output_h * output_w) + - kh * (kernel_w * output_h * output_w) + kw * (output_h * output_w); - const auto* src = data_im + nip * (height * width); - for (auto y = 0; y < output_h; y++) { - const auto iy = y * stride_h + kh; - const auto ix = kw; - if (stride_w == 1) { - memcpy( - dst + (y * output_w), - src + (iy * width + ix), - sizeof(T) * output_w); - } - else { - for (auto x = 0; x < output_w; x++) { - memcpy( - dst + (y * output_w + x), - src + (iy * width + ix + x * stride_w), - sizeof(T)); - } - } - } - } - return; - } - - // Fast path for equal padding - if (pad_l == pad_r && pad_t == pad_b) { - Im2colWithEqualPadding( - output_h, output_w, data_im, channels, height, width, - kernel_h, kernel_w, dilation_h, dilation_w, pad_t, pad_l, - stride_h, stride_w, data_col, padding_value); - return; - } - - // Baseline - const int64_t dkernel_h = dilation_h * (kernel_h - 1) + 1; - const int64_t dkernel_w = dilation_w * (kernel_w - 1) + 1; - - int64_t height_col = (height + pad_t + pad_b - dkernel_h) / stride_h + 1; - int64_t width_col = (width + pad_l + pad_r - dkernel_w) / stride_w + 1; - - int64_t channels_col = channels * kernel_h * kernel_w; - for (int64_t c = 0; c < channels_col; ++c) { - int64_t w_offset = c % kernel_w; - int64_t h_offset = (c / kernel_w) % kernel_h; - int64_t c_im = c / kernel_h / kernel_w; - for (int64_t h = 0; h < height_col; ++h) { - for (int64_t w = 0; w < width_col; ++w) { - int64_t h_pad = h * stride_h - pad_t + h_offset * dilation_h; - int64_t w_pad = w * stride_w - pad_l + w_offset * dilation_w; - if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width) - data_col[(c * height_col + h) * width_col + w] = - data_im[(c_im * height + h_pad) * width + w_pad]; - else - data_col[(c * height_col + h) * width_col + w] = padding_value; - } - } - } + const T* data_im, int64_t channels, + int64_t height, int64_t width, + int64_t kernel_h, int64_t kernel_w, + int64_t dilation_h, int64_t dilation_w, + int64_t pad_t, int64_t pad_l, int64_t pad_b, int64_t pad_r, + int64_t stride_h, int64_t stride_w, T* data_col, + T padding_value = 0) { + const int64_t output_h = + (height + pad_b + pad_t - (dilation_h * (kernel_h - 1) + 1)) + 
/ stride_h + 1; + const int64_t output_w = + (width + pad_l + pad_r - (dilation_w * (kernel_w - 1) + 1)) + / stride_w + 1; + + // Fast path for zero padding and no dilation + // From Torch, THNN_(unfolded_copy) + if (dilation_h == 1 && dilation_w == 1 && pad_l == 0 && pad_r == 0 && + pad_t == 0 && pad_b == 0) { + for (auto k = 0; k < channels * kernel_h * kernel_w; k++) { + const auto nip = k / (kernel_h * kernel_w); + const auto rest = k % (kernel_h * kernel_w); + const auto kh = rest / kernel_w; + const auto kw = rest % kernel_w; + auto* dst = data_col + nip * (kernel_h * kernel_w * output_h * output_w) + + kh * (kernel_w * output_h * output_w) + kw * (output_h * output_w); + const auto* src = data_im + nip * (height * width); + for (auto y = 0; y < output_h; y++) { + const auto iy = y * stride_h + kh; + const auto ix = kw; + if (stride_w == 1) { + memcpy( + dst + (y * output_w), + src + (iy * width + ix), + sizeof(T) * output_w); + } + else { + for (auto x = 0; x < output_w; x++) { + memcpy( + dst + (y * output_w + x), + src + (iy * width + ix + x * stride_w), + sizeof(T)); + } + } + } + } + return; + } + + // Fast path for equal padding + if (pad_l == pad_r && pad_t == pad_b) { + Im2colWithEqualPadding( + output_h, output_w, data_im, channels, height, width, + kernel_h, kernel_w, dilation_h, dilation_w, pad_t, pad_l, + stride_h, stride_w, data_col, padding_value); + return; + } + + // Baseline + const int64_t dkernel_h = dilation_h * (kernel_h - 1) + 1; + const int64_t dkernel_w = dilation_w * (kernel_w - 1) + 1; + + int64_t height_col = (height + pad_t + pad_b - dkernel_h) / stride_h + 1; + int64_t width_col = (width + pad_l + pad_r - dkernel_w) / stride_w + 1; + + int64_t channels_col = channels * kernel_h * kernel_w; + for (int64_t c = 0; c < channels_col; ++c) { + int64_t w_offset = c % kernel_w; + int64_t h_offset = (c / kernel_w) % kernel_h; + int64_t c_im = c / kernel_h / kernel_w; + for (int64_t h = 0; h < height_col; ++h) { + for (int64_t w = 0; w < width_col; ++w) { + int64_t h_pad = h * stride_h - pad_t + h_offset * dilation_h; + int64_t w_pad = w * stride_w - pad_l + w_offset * dilation_w; + if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width) + data_col[(c * height_col + h) * width_col + w] = + data_im[(c_im * height + h_pad) * width + w_pad]; + else + data_col[(c * height_col + h) * width_col + w] = padding_value; + } + } + } +} + + +// Loop over spatial axes in reverse order to choose an index, like counting. 
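+// Example: with shape = {2, 3} and dims = {0, 0}, successive calls to
+// NextPosition below yield {0, 1}, {0, 2}, {1, 0}, {1, 1}, {1, 2}; the call
+// that wraps dims back to {0, 0} returns false, so a do/while loop visits
+// every spatial position exactly once.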
+inline bool NextPosition(int64_t N, const int64_t* shape, int64_t* dims) { + bool has_next_output = false; + for (int64_t d_i = N - 1; d_i >= 0; --d_i) { + int64_t d_max = shape[d_i]; + // assert dims[d_i] < d_max + if (dims[d_i] == d_max - 1) { + dims[d_i] = 0; + } + else { // dims[d_i] < d_max - 1 + ++dims[d_i]; + has_next_output = true; + break; + } + } + return has_next_output; } template -void ComputePadAndOutputShape( - const int64_t in_dim, const int64_t stride, - const int64_t kernel, const int64_t dilation, - AutoPadType pad_type, int64_t* pad_head, - int64_t* pad_tail, int64_t* out_dim, - bool ForceSymmetricAutoPadding) { - - const int64_t dkernel = dilation * (kernel - 1) + 1; - - if (pad_type == AutoPadType::NOTSET) { - *out_dim = static_cast(static_cast( - in_dim + *pad_head + *pad_tail - dkernel) / stride + 1); - } - else { - switch (pad_type) { - case AutoPadType::VALID: - *pad_head = 0; - *pad_tail = 0; - *out_dim = (in_dim - dkernel) / stride + 1; - break; - case AutoPadType::SAME_UPPER: - case AutoPadType::SAME_LOWER: { - if (dilation != 1) - throw std::runtime_error( - "Dilation not supported for AutoPadType::SAME_UPPER or AutoPadType::SAME_LOWER."); - int64_t legacy_target_size = (in_dim + stride - 1) / stride; - int64_t pad_needed = (legacy_target_size - 1) * stride + kernel - in_dim; - *out_dim = (in_dim + pad_needed - dkernel) / stride + 1; - - // make sure padding is symmetric - if (ForceSymmetricAutoPadding) - pad_needed = roundUpPow2(pad_needed); - - *pad_head = (pad_type == AutoPadType::SAME_LOWER) - ? (pad_needed + 1) / 2 - : pad_needed / 2; - *pad_tail = pad_needed - *pad_head; - } break; - default: - throw std::runtime_error("Invalid argument in ComputePadAndOutputShape."); - } - } +void Im2col_NCHW(const T* data_im, + int64_t group_channels, + int64_t input_channels, + const int64_t* im_shape, + const int64_t* output_shape, + const int64_t* kernel_shape, + const int64_t* stride, + const int64_t* dilation, + const int64_t* pad, + ptrdiff_t rank, + T* data_col, + T padding_value) { + // iterate dimensions on output image shape (without Batch and Channel) + std::vector d_output(rank, 0); + // inner iterate dimensions on kernel shape (without output channel and input channel) + std::vector d_kernel(rank, 0); + + // Loop over spatial axes along the output image shape + do { + // Loop over spatial axes in reverse order to choose an index on kernel dimensions + do { + // Loop over spatial axes in forward order to compute the indices in the image + // and the inner col, and whether the index lies in the padding. 
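+            // index_im accumulates a row-major offset over the spatial dimensions;
+            // a single out-of-range axis is enough to mark the position as padding.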
+ int64_t index_im = 0; + bool is_padding = false; + for (ptrdiff_t d_i = 0; d_i < rank; ++d_i) { + int64_t d_im = d_output[d_i] * stride[d_i] - pad[d_i] + d_kernel[d_i] * dilation[d_i]; + is_padding |= !is_a_ge_zero_and_a_lt_b(d_im, im_shape[d_i]); + index_im *= im_shape[d_i]; + index_im += d_im; + } + index_im *= input_channels; + + if (is_padding) { + data_col = std::fill_n(data_col, group_channels, padding_value); + } + else { + data_col = std::copy_n(data_im + index_im, group_channels, data_col); + } + } while (NextPosition(rank, kernel_shape, d_kernel.data())); + } while (NextPosition(rank, output_shape, d_output.data())); } + template -void ComputeTransposePadAndOutputShape( - const int64_t in_size, const int64_t stride, - const int64_t kernel, const int64_t dilation, - const int64_t adj, AutoPadType pad_type, - int64_t* pad_head, int64_t* pad_tail, - int64_t* out_size) { - if (*out_size != -1) { - // total padding size - int64_t paddings = std::max(0, (in_size - 1) * stride + adj + (kernel - 1) * dilation + 1 - *out_size); - if (pad_type == AutoPadType::SAME_UPPER) { // pad more on head when paddings are odd. - *pad_head = paddings - paddings / 2; - *pad_tail = paddings / 2; - } - else { - // for pad_type is NOTSET, SAME_LOWER or VALID - // set pad_head as paddings/2, pad_tail as paddings-paddings/2. - // That said, we pad more on tail when paddings are odd. - *pad_head = paddings / 2; - *pad_tail = paddings - paddings / 2; - } - return; - } - if (pad_type != AutoPadType::NOTSET) { - switch (pad_type) { - // We handle cases of AutoPadType::VALID and AutoPadType::SAME_UPPER/LOWER, - // the same way - case AutoPadType::VALID: - case AutoPadType::SAME_UPPER: - case AutoPadType::SAME_LOWER: - *pad_head = 0; - *pad_tail = 0; - *out_size = (in_size - 1) * stride + adj + (kernel - 1) * dilation + 1; - break; - default: - throw std::runtime_error("pad type not supported"); - } - } - else { - *out_size = (in_size - 1) * stride + adj + - (kernel - 1) * dilation + 1 - - *pad_head - *pad_tail; - } +void Im2col_NHWC(const T* data_im, + int64_t input_channels, + const int64_t* input_shape, + const int64_t* output_shape, + const int64_t* kernel_shape, + const int64_t* stride, + const int64_t* dilation, + const int64_t* pad, + ptrdiff_t rank, + int64_t output_start, + int64_t output_count, + T const ** data_indirection, + const T* padding_ptr) { + if (rank == 1) { + int64_t stride_w = stride[0]; + int64_t kernel_w = kernel_shape[0]; + int64_t dilation_w = dilation[0]; + int64_t pad_l = pad[0]; + int64_t input_w = input_shape[0]; + + int64_t ow = output_start * stride_w; + + while (output_count--) { + int64_t iw = ow - pad_l; + for (int64_t kw = 0; kw < kernel_w; kw++) { + const T* data_ptr = data_im + iw * input_channels; + data_indirection[kw] = (is_a_ge_zero_and_a_lt_b(iw, input_w) ? 
data_ptr : padding_ptr); + iw += dilation_w; + } + data_indirection += kernel_w; + ow += stride_w; + } + } + else if (rank == 2) { + int64_t stride_h = stride[0]; + int64_t stride_w = stride[1]; + int64_t kernel_h = kernel_shape[0]; + int64_t kernel_w = kernel_shape[1]; + int64_t dilation_h = dilation[0]; + int64_t dilation_w = dilation[1]; + int64_t pad_t = pad[0]; + int64_t pad_l = pad[1]; + int64_t input_h = input_shape[0]; + int64_t input_w = input_shape[1]; + int64_t output_w = output_shape[1]; + + int64_t oh = (output_start / output_w) * stride_h; + int64_t ow = (output_start % output_w) * stride_w; + int64_t ow_end = output_w * stride_w; + + while (output_count--) { + for (int64_t kh = 0; kh < kernel_h; kh++) { + int64_t ih = kh * dilation_h + oh - pad_t; + if (is_a_ge_zero_and_a_lt_b(ih, input_h)) { + int64_t ihw = ih * input_w; + int64_t iw = ow - pad_l; + for (int64_t kw = 0; kw < kernel_w; kw++) { + const T* data_ptr = data_im + (ihw + iw) * input_channels; + data_indirection[kw] = (is_a_ge_zero_and_a_lt_b(iw, input_w) ? data_ptr : padding_ptr); + iw += dilation_w; + } + } + else { + std::fill_n(data_indirection, kernel_w, padding_ptr); + } + data_indirection += kernel_w; + } + ow += stride_w; + if (ow == ow_end) { + oh += stride_h; + ow = 0; + } + } + } + else { + // iterate dimensions on output image shape (without Batch and Channel) + std::vector d_output(rank, 0); + // inner iterate dimensions on kernel shape (without output channel and input channel) + std::vector d_kernel(rank, 0); + + // Skip ahead to the starting output index. + for (ptrdiff_t d_i = rank - 1; d_i >= 0; --d_i) { + d_output[d_i] = output_start % output_shape[d_i]; + output_start /= output_shape[d_i]; + } + + while (output_count--) { + // Loop over spatial axes in reverse order to choose an index on kernel dimensions + do { + // Loop over spatial axes in forward order to compute the indices in the image + // and the inner col, and whether the index lies in the padding. + int64_t index_im = 0; + bool is_padding = false; + for (ptrdiff_t d_i = 0; d_i < rank; ++d_i) { + int64_t d_input = d_output[d_i] * stride[d_i] - pad[d_i] + d_kernel[d_i] * dilation[d_i]; + is_padding |= !is_a_ge_zero_and_a_lt_b(d_input, input_shape[d_i]); + index_im *= input_shape[d_i]; + index_im += d_input; + } + const T* data_ptr = data_im + index_im * input_channels; + *data_indirection++ = is_padding ? padding_ptr : data_ptr; + } while (NextPosition(rank, kernel_shape, d_kernel.data())); + // Loop over spatial axes along the output image shape + NextPosition(rank, output_shape, d_output.data()); + } + } } -// The function adds value to C, assuming this array -// was initialized. -template -void gemm(bool transA, bool transB, - size_t M, size_t N, size_t K, NTYPE alpha, - const NTYPE* A, const NTYPE* B, NTYPE beta, - NTYPE* C) { - - if (transA) { - if (transB) { - } - else { - // a A B + b C, dimension = M * N - NTYPE* begin; - NTYPE val; - NTYPE val0; - size_t i, j, k, maxc=0; - const NTYPE *pA, *pB; - for(i = 0, begin = C; i < M; ++i) { - for(j = 0; j < N; ++j, ++begin) { - val0 = *begin * beta; - val = 0; - pA = A + i; - pB = B + j; - for(k = K; k > 0; --k, pA += K, pB += N) - val += *pA * *pB; - *begin = val0 + val * alpha; - maxc = maxc > (size_t)(begin - C) ? 
maxc : (size_t)(begin - C); - if (maxc > M * N) - throw std::runtime_error("gemm10: maxc > M * N"); - } - } - return; - } - } - else { - if (transB) { - } - else { - // a A B + b C, dimension = M * N - NTYPE* begin; - NTYPE val; - NTYPE val0; - size_t i, j, k, maxc=0; - const NTYPE *pA, *pB; - for(i = 0, begin = C; i < M; ++i) { - for(j = 0; j < N; ++j, ++begin) { - val0 = *begin * beta; - val = 0; - pA = A + i * K; - pB = B + j; - for(k = K; k > 0; --k, ++pA, pB += N) - val += *pA * *pB; - *begin = val0 + val * alpha; - maxc = maxc > (size_t)(begin - C) ? maxc : (size_t)(begin - C); - if (maxc > M * N) - throw std::runtime_error("gemm00: maxc > M * N"); - } - } - return; - } - } - throw std::runtime_error("Not implemented for transposed matrices."); -} +template +void Im2col_NHWC(const T* data_im, + int64_t group_channels, + int64_t input_channels, + int64_t input_h, + int64_t input_w, + int64_t kernel_h, + int64_t kernel_w, + int64_t dilation_h, + int64_t dilation_w, + int64_t pad_t, + int64_t pad_l, + int64_t stride_h, + int64_t stride_w, + int64_t output_w, + int64_t output_start, + int64_t output_count, + T* data_col, + T padding_value) { + int64_t mh = output_start / output_w; + int64_t mw = output_start % output_w; + for (int64_t mz = output_start; mz < output_start + output_count; mz++) { + int64_t oh = mh * stride_h; + int64_t ow = mw * stride_w; + + for (int64_t kh = 0; kh < kernel_h; kh++) { + int64_t ih = kh * dilation_h + oh - pad_t; + + if (is_a_ge_zero_and_a_lt_b(ih, input_h)) { + int64_t iw = ow - pad_l; + if (dilation_w == 1 && group_channels == input_channels) { + int64_t kw = kernel_w; + while (kw > 0) { + if (is_a_ge_zero_and_a_lt_b(iw, input_w)) { + // Increase the copy count size to reduce the number of copy calls. + int64_t batch_w = std::min(kw, input_w - iw); + std::memcpy(data_col, data_im + (ih * input_w + iw) * group_channels, + static_cast(sizeof(T) * batch_w * group_channels)); + data_col += batch_w * group_channels; + iw += batch_w; + kw -= batch_w; + } + else { + data_col = std::fill_n(data_col, group_channels, padding_value); + iw++; + kw--; + } + } + } + else { + for (int64_t kw = 0; kw < kernel_w; kw++) { + if (is_a_ge_zero_and_a_lt_b(iw, input_w)) { + // N.B. Using std::memcpy helped here over std::copy_n when doing a + // transform for an image with a small number of group channels. + std::memcpy(data_col, data_im + (ih * input_w + iw) * input_channels, + static_cast(sizeof(T) * group_channels)); + data_col += group_channels; + } + else { + data_col = std::fill_n(data_col, group_channels, padding_value); + } + iw += dilation_w; + } + } + } + else { + data_col = std::fill_n(data_col, kernel_w * group_channels, padding_value); + } + } + + if (++mw == output_w) { + ++mh; + mw = 0; + } + } +} +void ComputePadAndOutputShape( + int64_t in_dim, int64_t stride, + int64_t kernel, int64_t dilation, + AutoPadType pad_type, int64_t* pad_head, + int64_t* pad_tail, int64_t* out_dim, + bool ForceSymmetricAutoPadding); + template -class ConvPoolCommon { +void ComputeTransposePadAndOutputShape( + int64_t in_size, int64_t stride, + int64_t kernel, int64_t dilation, + int64_t adj, AutoPadType pad_type, + int64_t* pad_head, int64_t* pad_tail, + int64_t* out_size) { + if (*out_size != -1) { + // total padding size + int64_t paddings = std::max(0, (in_size - 1) * stride + adj + (kernel - 1) * dilation + 1 - *out_size); + if (pad_type == AutoPadType::SAME_UPPER) { // pad more on head when paddings are odd. 
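+            // e.g. paddings = 5 gives pad_head = 3 and pad_tail = 2.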
+ *pad_head = paddings - paddings / 2; + *pad_tail = paddings / 2; + } + else { + // for pad_type is NOTSET, SAME_LOWER or VALID + // set pad_head as paddings/2, pad_tail as paddings-paddings/2. + // That said, we pad more on tail when paddings are odd. + *pad_head = paddings / 2; + *pad_tail = paddings - paddings / 2; + } + return; + } + if (pad_type != AutoPadType::NOTSET) { + switch (pad_type) { + // We handle cases of AutoPadType::VALID and AutoPadType::SAME_UPPER/LOWER, + // the same way + case AutoPadType::VALID: + case AutoPadType::SAME_UPPER: + case AutoPadType::SAME_LOWER: + *pad_head = 0; + *pad_tail = 0; + *out_size = (in_size - 1) * stride + adj + (kernel - 1) * dilation + 1; + break; + default: + throw std::runtime_error("pad type not supported"); + } + } + else { + *out_size = (in_size - 1) * stride + adj + + (kernel - 1) * dilation + 1 - + *pad_head - *pad_tail; + } +} + + +class ConvPoolCommonShape { + +protected: + + AutoPadType auto_pad_; + std::vector kernel_shape_; + +public: + + ConvPoolCommonShape() {} + + void init(const std::string& auto_pad, py_array_t kernel_shape); + void compute_kernel_shape(const std::vector& weight_shape, std::vector& kernel_shape) const; + + void infer_output_shape( + const std::vector& input_shape, + const std::vector& kernel_shape, + const std::vector& strides_p, + const std::vector& dilations_p, + std::vector& pads_p, + std::vector& output_shape, + bool ForceSymmetricAutoPadding) const; }; +class ConvPoolCommon : public ConvPoolCommonShape { + +protected: + + std::vector dilations_; + int64_t group_; + std::vector pads_; + std::vector strides_; + +public: + + void init(const std::string& auto_pad, + py_array_t dilations, + int64_t group, + py_array_t kernel_shape, + py_array_t pads, + py_array_t strides); +}; diff --git a/mlprodict/onnxrt/ops_cpu/op_conv_transpose_.cpp b/mlprodict/onnxrt/ops_cpu/op_conv_transpose_.cpp index 62c7019e6..7bba1b8de 100644 --- a/mlprodict/onnxrt/ops_cpu/op_conv_transpose_.cpp +++ b/mlprodict/onnxrt/ops_cpu/op_conv_transpose_.cpp @@ -23,16 +23,10 @@ namespace py = pybind11; template -class ConvTranspose : ConvPoolCommon { +class ConvTranspose : ConvPoolCommon { - private: + protected: - AutoPadType auto_pad_; - std::vector dilations_; - int64_t group_; - std::vector kernel_shape_; - std::vector pads_; - std::vector strides_; std::vector output_padding_; std::vector output_shape_; @@ -83,7 +77,7 @@ class ConvTranspose : ConvPoolCommon { }; template -ConvTranspose::ConvTranspose() : ConvPoolCommon() { +ConvTranspose::ConvTranspose() : ConvPoolCommon() { } @@ -96,14 +90,8 @@ void ConvTranspose::init( py::array_t pads, py::array_t strides, py::array_t output_padding, - py::array_t output_shape - ) { - auto_pad_ = to_AutoPadType(auto_pad); - array2vector(dilations_, dilations, int64_t); - group_ = group; - array2vector(dilations_, dilations, int64_t); - array2vector(pads_, pads, int64_t); - array2vector(strides_, strides, int64_t); + py::array_t output_shape) { + ConvPoolCommon::init(auto_pad, dilations, group, kernel_shape, pads, strides); array2vector(output_padding_, output_padding, int64_t); array2vector(output_shape_, output_shape, int64_t); } diff --git a/mlprodict/onnxrt/ops_cpu/op_max_pool_.cpp b/mlprodict/onnxrt/ops_cpu/op_max_pool_.cpp index afb64c8e8..15d898fe4 100644 --- a/mlprodict/onnxrt/ops_cpu/op_max_pool_.cpp +++ b/mlprodict/onnxrt/ops_cpu/op_max_pool_.cpp @@ -23,18 +23,12 @@ namespace py = pybind11; template -class MaxPool : ConvPoolCommon { +class MaxPool : ConvPoolCommon { private: - AutoPadType 
auto_pad_;
     int64_t ceil_mode_;
     int64_t storage_order_;
-    std::vector pads_;
-    std::vector strides_;
-    std::vector kernel_shape_;
-    std::vector dilations_;
-
     bool global_pooling_;

 public:
@@ -91,7 +85,7 @@ class MaxPool : ConvPoolCommon {

 template
-MaxPool::MaxPool() : ConvPoolCommon() {
+MaxPool::MaxPool() : ConvPoolCommon() {
     global_pooling_ = false;
 }

@@ -104,16 +98,10 @@ void MaxPool::init(
     int64_t storage_order,
     py::array_t kernel_shape,
     py::array_t pads,
-    py::array_t strides
-    ) {
-    auto_pad_ = to_AutoPadType(auto_pad);
-    array2vector(dilations_, dilations, int64_t);
+    py::array_t strides) {
+    ConvPoolCommon::init(auto_pad, dilations, 0, kernel_shape, pads, strides);
     ceil_mode_ = ceil_mode;
     storage_order_ = storage_order;
-    array2vector(dilations_, dilations, int64_t);
-    array2vector(pads_, pads, int64_t);
-    array2vector(strides_, strides, int64_t);
-    array2vector(kernel_shape_, kernel_shape, int64_t);
 }
diff --git a/mlprodict/onnxrt/ops_cpu/op_qlinear_conv.py b/mlprodict/onnxrt/ops_cpu/op_qlinear_conv.py
new file mode 100644
index 000000000..5317e7a8f
--- /dev/null
+++ b/mlprodict/onnxrt/ops_cpu/op_qlinear_conv.py
@@ -0,0 +1,58 @@
+# -*- encoding: utf-8 -*-
+# pylint: disable=E0203,E1101,C0111
+"""
+@file
+@brief Runtime operator.
+"""
+import numpy
+from ._op import OpRun
+from ..shape_object import ShapeObject
+from .op_qlinear_conv_ import QLinearConvInt8, QLinearConvUInt8  # pylint: disable=E0611,E0401
+
+
+class QLinearConv(OpRun):
+
+    atts = {'auto_pad': 'NOTSET',
+            'group': 1,
+            'dilations': [],
+            'kernel_shape': [],
+            'pads': [],
+            'strides': []}
+
+    def __init__(self, onnx_node, desc=None, **options):
+        OpRun.__init__(self, onnx_node, desc=desc,
+                       expected_attributes=QLinearConv.atts,
+                       **options)
+        self._init()
+        self._cstu8 = numpy.array([], dtype=numpy.uint8)
+        self._csti8 = numpy.array([], dtype=numpy.int8)
+
+    def _init(self):
+        self.rtu8_ = QLinearConvUInt8()
+        self.rti8_ = QLinearConvInt8()
+        for rt in [self.rtu8_, self.rti8_]:
+            rt.init(self.auto_pad,
+                    numpy.array(self.dilations, dtype=numpy.int64),
+                    self.group,
+                    numpy.array(self.kernel_shape, dtype=numpy.int64),
+                    numpy.array(self.pads, dtype=numpy.int64),
+                    numpy.array(self.strides, dtype=numpy.int64))
+
+    def _run(self, X, x_scale, x_zero_point, w, w_scale, w_zero_point,  # pylint: disable=W0221
+             y_scale, y_zero_point, B=None):
+        if X is None:
+            raise ValueError(  # pragma: no cover
+                "X cannot be None for operator %r, ONNX=%r" % (
+                    type(self), self.onnx_node))
+        if X.dtype == numpy.uint8:
+            return (self.rtu8_.compute(
+                X, x_scale, x_zero_point, w, w_scale, w_zero_point,
+                y_scale, y_zero_point, self._cstu8 if B is None else B), )
+        return (self.rti8_.compute(
+            X, x_scale, x_zero_point, w, w_scale, w_zero_point,
+            y_scale, y_zero_point, self._csti8 if B is None else B), )
+
+    def _infer_shapes(self, X, x_scale, x_zero_point, w, w_scale,  # pylint: disable=W0221
+                      w_zero_point, y_scale, y_zero_point, B=None):
+
+        return (ShapeObject(None, dtype=X.dtype), )
diff --git a/mlprodict/onnxrt/ops_cpu/op_qlinear_conv_.cpp b/mlprodict/onnxrt/ops_cpu/op_qlinear_conv_.cpp
new file mode 100644
index 000000000..2e905043e
--- /dev/null
+++ b/mlprodict/onnxrt/ops_cpu/op_qlinear_conv_.cpp
@@ -0,0 +1,61 @@
+// Inspired from
+// https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/core/providers/cpu/ml/tree_ensemble_classifier.cc.
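+//
+// QLinearConv computes, for every output element,
+//   y = saturate(round((x_scale * w_scale / y_scale)
+//                      * conv(x - x_zero_point, w - w_zero_point)) + y_zero_point).
+// The runtime below follows that scheme with an int32 GEMM followed by
+// RequantizeOutput (see op_qlinear_conv_.hpp).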
+ +#if !defined(_CRT_SECURE_NO_WARNINGS) +#define _CRT_SECURE_NO_WARNINGS +#endif + + +#include "op_qlinear_conv_.hpp" + + +#ifndef SKIP_PYTHON + + +class QLinearConvInt8 : public QLinearConv { +public: + QLinearConvInt8() : QLinearConv() {} +}; + + +class QLinearConvUInt8 : public QLinearConv { +public: + QLinearConvUInt8() : QLinearConv() {} +}; + + +PYBIND11_MODULE(op_qlinear_conv_, m) { + m.doc() = +#if defined(__APPLE__) + "Implements Conv operator." +#else + R"pbdoc(Implements runtime for operator QLinearConv. The code is inspired from +`conv.cc `_ +in :epkg:`onnxruntime`.)pbdoc" +#endif +; + + py::class_ clf(m, "QLinearConvUInt8", + R"pbdoc(Implements uint8 runtime for operator QLinearConvUInt8. The code is inspired from +`qlinearconv.cc `_ +in :epkg:`onnxruntime`. Supports uint8 only.)pbdoc"); + + clf.def(py::init<>()); + clf.def("init", &QLinearConvUInt8::init, + "Initializes the runtime with the ONNX attributes."); + clf.def("compute", &QLinearConvUInt8::compute, + "Computes the output for operator QLinearConv."); + + py::class_ cld(m, "QLinearConvInt8", + R"pbdoc(Implements int8 runtime for operator QLinearConv. The code is inspired from +`qlinearconv.cc `_ +in :epkg:`onnxruntime`. Supports int8 only.)pbdoc"); + + cld.def(py::init<>()); + cld.def("init", &QLinearConvInt8::init, + "Initializes the runtime with the ONNX attributes."); + cld.def("compute", &QLinearConvInt8::compute, + "Computes the output for operator QLinearConv."); +} + +#endif diff --git a/mlprodict/onnxrt/ops_cpu/op_qlinear_conv_.hpp b/mlprodict/onnxrt/ops_cpu/op_qlinear_conv_.hpp new file mode 100644 index 000000000..2d745f899 --- /dev/null +++ b/mlprodict/onnxrt/ops_cpu/op_qlinear_conv_.hpp @@ -0,0 +1,571 @@ +#pragma once + +// Inspired from +// https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/core/providers/cpu/ml/tree_ensemble_classifier.cc. + +#if !defined(_CRT_SECURE_NO_WARNINGS) +#define _CRT_SECURE_NO_WARNINGS +#endif + +#include "op_conv_matrices_.hpp" + +#define ROUNDING_BIAS_MAGIC 12582912.f +#define ROUNDING_BIAS_MAGIC_BITS 0x4B400000 + + +template +inline bool is_signed() { return true; } + +template <> +inline bool is_signed() { return false; } + + +inline uint32_t BitsOfFp32(float FloatValue) { + union { + uint32_t IntegerValue; + float FloatValue; + } u; + u.FloatValue = FloatValue; + return u.IntegerValue; +} + + + +/** +* This routine requantizes the intermediate buffer to the output buffer +* optionally adding the supplied bias. +* Parameters: +* * Input - Supplies the input matrix. +* * Output - Supplies the output matrix. +* * Bias - Supplies the optional bias vector to be added to the input buffer +* before requantization. +* * Buffer - Supplies the output matrix. +* * M - Supplies the number of elements of the bias vector and the number of +* rows in the output matrix. +* * N - Supplies the number of columns of the output matrix. +* * Scale - Supplies the quantization scale. +* * PerColumnScale - Supplies true if the quantization scale has per-column +* values, else false if a single quantization scale applies to the +* entire matrix. +* * ZeroPoint - Supplies the quantization zero point value. +*/ +template +void RequantizeOutput( + const int32_t* Input, + T* Output, + const int32_t* Bias, + size_t M, + size_t N, + const float* Scale, + bool PerColumnScale, + T ZeroPoint) { + const float PerMatrixScaleValue = PerColumnScale ? 
0.0f : *Scale; + const float MinimumValue = float(0 - ZeroPoint); + const float MaximumValue = float(255 - ZeroPoint); + + // + // Step through each row of the output matrix. + // + + while (M-- > 0) { + + const int32_t* bias = Bias; + const float* scale = Scale; + int32_t IntegerValue; + size_t n = N; + + while (n > 0) { + + IntegerValue = *Input++; + if (bias != nullptr) + IntegerValue += *bias++; + + float FloatValue = float(IntegerValue); + float ScaleValue = PerColumnScale ? *scale++ : PerMatrixScaleValue; + + FloatValue *= ScaleValue; + FloatValue = std::max(FloatValue, MinimumValue); + FloatValue = std::min(FloatValue, MaximumValue); + + // + // Use the fast rounding trick adapted from XNNPACK: bias the floating + // point value by the first floating point value that has no + // fractional bits. The add operation performs the "round to nearest + // even". Extract the mantissa bits from this floating point value to + // obtain the rounded integer value. + // + + IntegerValue = int32_t(BitsOfFp32(FloatValue + ROUNDING_BIAS_MAGIC)) - ROUNDING_BIAS_MAGIC_BITS; + *Output++ = T(IntegerValue + ZeroPoint); + n -= 1; + } + } +} + + +template +class QLinearConv : public ConvPoolCommon { + +private: + + std::vector W_shape_; + T* packed_W_buffer_; + size_t packed_W_size_; + T* reordered_W_buffer_; + bool is_W_signed_; + bool is_W_packed_; + bool channels_last_; + +public: + + typedef const T* T_const_ptr; + + QLinearConv() : ConvPoolCommon() {} + + void init( + const std::string& auto_pad, + py_array_t dilations, + int64_t group, + py_array_t kernel_shape, + py_array_t pads, + py_array_t strides) { + ConvPoolCommon::init(auto_pad, dilations, group, kernel_shape, pads, strides); + is_W_signed_ = is_signed(); + channels_last_ = false; + reordered_W_buffer_ = nullptr; + packed_W_buffer_ = nullptr; + is_W_packed_ = false; + packed_W_size_ = 0; + } + + py_array_t compute( + py_array_t X, + float x_scale, T x_zero_point, + py_array_t W, + py_array_t w_scale, + T w_zero_point, + float y_scale, T y_zero_point, + py_array_t B) const { + + std::vector x_dims; + arrayshape2vector(x_dims, X); + std::vector w_dims; + arrayshape2vector(w_dims, W); + + const int64_t N = x_dims[0]; + const int64_t M = w_dims[0]; + const auto& W_shape = M > 0 ? w_dims : W_shape_; + const bool is_W_signed = is_W_signed_; + //const Tensor* W = is_W_packed_ ? nullptr : context->Input(3); + + std::vector output_scales; + std::vector W_scale_shape; + arrayshape2vector(W_scale_shape, w_scale); + const int64_t W_scale_size = flattened_dimension(W_scale_shape); + const float* W_scale_data = w_scale.data(); + output_scales.resize(static_cast(W_scale_size)); + for (int64_t i = 0; i < W_scale_size; i++) + output_scales[i] = (x_scale * W_scale_data[i] / y_scale); + + std::vector kernel_shape; + compute_kernel_shape(w_dims, kernel_shape); + const size_t kernel_rank = kernel_shape.size(); + std::vector pads(pads_); + if (pads.empty()) + pads.resize(kernel_rank * 2, 0); + std::vector dilations(dilations_); + if (dilations.empty()) + dilations.resize(kernel_rank, 1); + std::vector strides(strides_); + if (strides.empty()) + strides.resize(kernel_rank, 1); + + const int64_t C = x_dims[channels_last_ ? 1 + kernel_rank : 1]; + const size_t spatial_dim_start = channels_last_ ? 
1 : 2; + const size_t spatial_dim_end = spatial_dim_start + kernel_rank; + std::vector y_dims({ N }); + if (!channels_last_) + y_dims.push_back(M); + + std::vector sliced_input_shape(spatial_dim_end - spatial_dim_start); + for (size_t i = 0; i < sliced_input_shape.size(); ++i) + sliced_input_shape[i] = x_dims[i + spatial_dim_start]; + + infer_output_shape(sliced_input_shape, kernel_shape, strides, dilations, pads, y_dims, false); + if (channels_last_) + y_dims.push_back(M); + + std::vector output_shape(y_dims.begin() + spatial_dim_start, y_dims.begin() + spatial_dim_end); + py_array_t Y(y_dims); + { + py_gil_scoped_release release; + compute_gil_free(X, W, B, Y, + sliced_input_shape, output_shape, + kernel_shape, pads, dilations, strides, + x_dims, y_dims, w_dims, + x_scale, x_zero_point, + w_scale, w_zero_point, + y_scale, y_zero_point, + output_scales); + } + return Y; + } + +private: + + bool HasStridesOneAndNoPadding() const { + if (std::all_of(strides_.begin(), strides_.end(), [](int64_t v) { return v == 1; })) { + if (std::all_of(pads_.begin(), pads_.end(), [](int64_t v) { return v == 0; })) + return true; + } + return false; + } + + void ReorderFilter( + const T* input, T* output, + size_t output_channels, + size_t input_channels, + size_t kernel_size) const { + for (size_t k = 0; k < kernel_size; k++) { + for (size_t ic = 0; ic < input_channels; ic++) { + for (size_t oc = 0; oc < output_channels; oc++) { + size_t index = (oc * input_channels * kernel_size) + (ic * kernel_size) + k; + *output++ = input[index]; + } + } + } + } + + + void compute_gil_free( + py_array_t X, py_array_t W, + py_array_t B, py_array_t& Y, + const std::vector& input_shape, + const std::vector& output_shape, + const std::vector& kernel_shape, + const std::vector& pads, + const std::vector& dilations, + const std::vector& strides, + const std::vector& x_dims, + const std::vector& y_dims, + const std::vector& w_dims, + float x_scale, T x_zero_point, + py_array_t w_scale, T w_zero_point, + float y_scale, T y_zero_point, + const std::vector& output_scales) const { + + // see https://github.com/microsoft/onnxruntime/pull/7885/ + std::vector b_dims; + arrayshape2vector(b_dims, B); + + const int64_t N = x_dims[0]; + const int64_t C = x_dims[1]; + const int64_t M = w_dims[0]; + + const int64_t input_image_size = flattened_dimension(input_shape); + const int64_t output_image_size = flattened_dimension(output_shape); + const int64_t kernel_size = flattened_dimension(kernel_shape); + + // Handle the case of a dynamic weight filter. + T* reordered_W_buffer = nullptr; + T* reordered_W = nullptr; + if (!packed_W_buffer_) { + if (w_dims.empty()) { + // Weight was constant and reordered. + reordered_W = reordered_W_buffer_; + } + else { + // Weight tensor was not constant or prepacking is disabled. + reordered_W = new T[flattened_dimension(w_dims)]; + reordered_W_buffer = reordered_W; + ReorderFilter(W.data(), + reordered_W, + static_cast(M), + static_cast(w_dims[1]), + static_cast(kernel_size)); + } + } + + int64_t group_count = group_; + int64_t group_input_channels = w_dims[1]; + int64_t group_output_channels = M / group_count; + + // Test for depthwise convolution. + const bool is_depthwise_conv = (reordered_W != nullptr && group_input_channels == 1 && group_output_channels == 1); + if (is_depthwise_conv) { + // Update the input and output channels to the number of groups in order to + // reuse as much of the below standard convolution path. 
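+            // A depthwise convolution with G groups has one input and one output
+            // channel per group; treating it as a single group of G channels lets
+            // QConvDepthwise multiply each channel by its own filter instead of
+            // running one small GEMM per group.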
+ group_input_channels = group_count; + group_output_channels = group_count; + group_count = 1; + } + + const int64_t X_offset = C * input_image_size; + const int64_t Y_offset = M * output_image_size; + const int64_t kernel_dim = group_input_channels * kernel_size; + const int64_t col_buffer_size = kernel_dim * output_image_size; + + // Use an intermediate int32_t buffer for the GEMM computation before + // requantizing to the output type. + int32_t* gemm_output_data = new int32_t[Y_offset]; + int32_t* gemm_output_buffer = gemm_output_data; + int32_t* gemm_output = gemm_output_data; + + const T* Xdata = X.data(); + const int32_t* Bdata = (int32_t*)(b_dims.size() > 0 ? B.data() : nullptr); + T* Ydata = (T*)Y.data(); + + T* transpose_input_buffer = nullptr; + T* transpose_output_buffer = nullptr; + T* transpose_input = nullptr; + T* transpose_output = nullptr; + if (!channels_last_) { + transpose_input = new T[X_offset]; + transpose_input_buffer = transpose_input; + transpose_output = new T[Y_offset]; + transpose_output_buffer = transpose_output; + } + + T_const_ptr* col_buffer = nullptr; + T_const_ptr* col_data = nullptr; + std::vector padding_data; + const size_t kernel_rank = kernel_shape.size(); + + if (is_depthwise_conv) { + // Allocate indirection buffer pointers and prepare a padding vector for + // the im2col transform. + col_data = new T_const_ptr[kernel_size * output_image_size]; + col_buffer = col_data; + padding_data.resize(static_cast(C), x_zero_point); + } + else if (kernel_size != 1 || !HasStridesOneAndNoPadding()) { + // Pointwise convolutions can use the original input tensor in place, + // otherwise a temporary buffer is required for the im2col transform. + int64_t group_col_buffer_size = (kernel_rank > 2) ? group_count * col_buffer_size : col_buffer_size; + col_data = new T_const_ptr[group_col_buffer_size]; + col_buffer = col_data; + } + + // See onnxruntime. + int32_t maximum_thread_count = 16; + constexpr double thread_complexity = static_cast(64 * 1024); + const double complexity = static_cast(output_image_size) * + static_cast(group_output_channels) * + static_cast(kernel_dim); + + + // OMP +#if false //USE_OPENMP + int32_t thread_count = maximum_thread_count; + if (complexity < thread_complexity * maximum_thread_count) + thread_count = static_cast(complexity / thread_complexity) + 1; + // Ensure that every thread produces at least one output. + if (thread_count > output_image_size) + thread_count = static_cast(output_image_size); + thread_count = std::min(thread_count, ::omp_get_max_threads()); +#else + int32_t thread_count = 1; +#endif + + for (int64_t image_id = 0; image_id < N; ++image_id) { + const T* input_data = Xdata; + T* output_data = Ydata; + + if (!channels_last_) { + // Transpose the input from channels first (NCHW) to channels last (NHWC). + TensorTranspose( + Xdata, + transpose_input_buffer, + static_cast(C), + static_cast(input_image_size)); + input_data = transpose_input_buffer; + output_data = transpose_output_buffer; + } + + // Threaded implementation of ND convolution is not yet supported, so + // prepare all im2col transformations here. 
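+            // Each group owns its own slice of col_buffer (offset by
+            // group_id * col_buffer_size); the threaded loop below only picks
+            // the matching slice.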
+ if (!is_depthwise_conv && col_buffer && kernel_rank > 2) { + for (int64_t group_id = 0; group_id < group_count; ++group_id) { + Im2col_NCHW( + input_data + group_id * group_input_channels, + group_input_channels, + C, + input_shape.data(), + output_shape.data(), + kernel_shape.data(), + strides.data(), + dilations.data(), + pads.data(), + static_cast(kernel_rank), + ((T*)col_buffer) + group_id * col_buffer_size, // sizeof(T*) > sizeof(T) + x_zero_point); + } + } + +#if false // USE_OPENMP +#pragma omp parallel for +#endif + for (int32_t batch_idx = 0; batch_idx < thread_count; ++batch_idx) { + int64_t output_start, output_end; + std::ptrdiff_t work_per_batch = output_image_size / thread_count; + std::ptrdiff_t work_per_batch_extra = output_image_size % thread_count; + if (batch_idx < work_per_batch_extra) { + output_start = (work_per_batch + 1) * batch_idx; + output_end = output_start + work_per_batch + 1; + } + else { + output_start = work_per_batch * batch_idx + work_per_batch_extra; + output_end = output_start + work_per_batch; + } + int64_t output_count = output_end - output_start; + + int32_t* worker_gemm_output = gemm_output + output_start * M; + T* worker_requantize_output = output_data + output_start * M; + + if (is_depthwise_conv) { + T_const_ptr* worker_col_buffer = col_buffer + output_start * kernel_size; + Im2col_NHWC( + input_data, + C, + input_shape.data(), + output_shape.data(), + kernel_shape.data(), + strides.data(), + dilations.data(), + pads.data(), + static_cast(kernel_rank), + output_start, + output_count, + worker_col_buffer, + padding_data.data()); + QConvDepthwise( + worker_col_buffer, + x_zero_point, + reordered_W, + w_zero_point, + is_signed(), + worker_gemm_output, + static_cast(M), + static_cast(output_count), + static_cast(kernel_size)); + } + else { + for (int64_t group_id = 0; group_id < group_count; ++group_id) { + // Prepare the im2col transformation or use the input buffer directly for + // pointwise convolutions. + const T* worker_gemm_input; + if (col_buffer) { + T* worker_col_buffer = ((T*)col_buffer) + output_start * kernel_dim; + if (kernel_rank == 2) { + Im2col_NHWC( + input_data + group_id * group_input_channels, + group_input_channels, + C, + input_shape[0], + input_shape[1], + kernel_shape[0], + kernel_shape[1], + dilations[0], + dilations[1], + pads[0], + pads[1], + strides[0], + strides[1], + output_shape[1], + output_start, + output_count, + worker_col_buffer, + x_zero_point); + } + else if (kernel_rank == 1) { + Im2col_NHWC( + input_data + group_id * group_input_channels, + group_input_channels, + C, + 1, + input_shape[0], + 1, + kernel_shape[0], + 1, + dilations[0], + 0, + pads[0], + 1, + strides[0], + output_shape[0], + output_start, + output_count, + worker_col_buffer, + x_zero_point); + } + else { + // Use the im2col buffer prepared outside the thread, indexed by group. 
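+                            // kernel_rank > 2: this slice was already filled by the
+                            // ND Im2col_NCHW call before the thread loop, so no
+                            // per-thread transform is needed here.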
+ worker_col_buffer += group_id * col_buffer_size; + } + worker_gemm_input = worker_col_buffer; + } + else { + worker_gemm_input = input_data + output_start * kernel_dim; + } + + size_t ldb = 0; + const T* ptrB; + bool BIsPacked = false; + if (packed_W_buffer_) { + ptrB = static_cast(packed_W_buffer_) + group_id * packed_W_size_, + BIsPacked = true; + } + else { + ptrB = reordered_W + group_id * group_output_channels, + ldb = static_cast(M); + } + + QGemm( + false, false, + static_cast(output_count), // M + static_cast(group_output_channels), // N + static_cast(kernel_dim), 1, // K, alpha + worker_gemm_input, ptrB, 1, // A, B, beta + worker_gemm_output + group_id * group_output_channels, // C + static_cast(kernel_dim), ldb, static_cast(M), // lda, ldb, ldc + x_zero_point, &w_zero_point, // ZeroPointA, ZeroPointB + BIsPacked, false); // BIsPacked, PerColumnZeroPoints + } + } + + RequantizeOutput( + worker_gemm_output, + worker_requantize_output, + Bdata, + static_cast(output_count), + static_cast(M), + output_scales.data(), + output_scales.size() > 1, + y_zero_point); + } + + if (!channels_last_) { + // Transpose the output from channels last (NHWC) to channels first (NCHW). + TensorTranspose( + output_data, + Ydata, + static_cast(output_image_size), + static_cast(M)); + } + + Xdata += X_offset; + Ydata += Y_offset; + + } + + if (transpose_input != nullptr) + delete[] transpose_input; + if (transpose_output != nullptr) + delete[] transpose_output; + if (reordered_W_buffer != nullptr) + delete[] reordered_W_buffer; + if (col_data != nullptr) + delete[] col_data; + } +}; + diff --git a/mlprodict/onnxrt/ops_cpu/op_where.py b/mlprodict/onnxrt/ops_cpu/op_where.py index e7e20af2a..0b8c6f9dd 100644 --- a/mlprodict/onnxrt/ops_cpu/op_where.py +++ b/mlprodict/onnxrt/ops_cpu/op_where.py @@ -19,7 +19,7 @@ def _run(self, condition, x, y): # pylint: disable=W0221 raise RuntimeError( # pragma: no cover "x and y should share the same dtype {} != {}".format( x.dtype, y.dtype)) - if x.shape != y.shape: + if x.shape != y.shape and x.shape != (1, ) and y.shape != (1, ): raise RuntimeError( # pragma: no cover "x and y should share the same shape {} != {}".format( x.shape, y.shape)) diff --git a/setup.py b/setup.py index 31c2fc4b4..9dd2ace6e 100644 --- a/setup.py +++ b/setup.py @@ -95,6 +95,7 @@ def get_extensions(): ext_max_pool = Extension( 'mlprodict.onnxrt.ops_cpu.op_max_pool_', [os.path.join(root, 'mlprodict/onnxrt/ops_cpu/op_max_pool_.cpp'), + os.path.join(root, 'mlprodict/onnxrt/ops_cpu/op_conv_matrices_.cpp'), os.path.join(root, 'mlprodict/onnxrt/ops_cpu/op_common_.cpp'), os.path.join(root, 'mlprodict/onnxrt/ops_cpu/op_common_num_.cpp')], extra_compile_args=extra_compile_args, @@ -255,6 +256,7 @@ def get_extensions(): ext_conv = Extension( 'mlprodict.onnxrt.ops_cpu.op_conv_', [os.path.join(root, 'mlprodict/onnxrt/ops_cpu/op_conv_.cpp'), + os.path.join(root, 'mlprodict/onnxrt/ops_cpu/op_conv_matrices_.cpp'), os.path.join(root, 'mlprodict/onnxrt/ops_cpu/op_common_.cpp')], extra_compile_args=extra_compile_args, extra_link_args=extra_link_args, @@ -267,9 +269,27 @@ def get_extensions(): define_macros=define_macros, language='c++') + ext_qlinearconv = Extension( + 'mlprodict.onnxrt.ops_cpu.op_qlinear_conv_', + [os.path.join(root, 'mlprodict/onnxrt/ops_cpu/op_qlinear_conv_.cpp'), + os.path.join(root, 'mlprodict/onnxrt/ops_cpu/op_conv_matrices_.cpp'), + os.path.join(root, 'mlprodict/onnxrt/ops_cpu/op_common_.cpp') + ], + extra_compile_args=extra_compile_args, + extra_link_args=extra_link_args, + include_dirs=[ 
+ # Path to pybind11 headers + get_pybind_include(), + get_pybind_include(user=True), + os.path.join(root, 'mlprodict/onnxrt/ops_cpu') + ], + define_macros=define_macros, + language='c++') + ext_conv_transpose = Extension( 'mlprodict.onnxrt.ops_cpu.op_conv_transpose_', [os.path.join(root, 'mlprodict/onnxrt/ops_cpu/op_conv_transpose_.cpp'), + os.path.join(root, 'mlprodict/onnxrt/ops_cpu/op_conv_matrices_.cpp'), os.path.join(root, 'mlprodict/onnxrt/ops_cpu/op_common_.cpp')], extra_compile_args=extra_compile_args, extra_link_args=extra_link_args, @@ -316,6 +336,7 @@ def get_extensions(): ext_experimental_c, ext_gather, ext_max_pool, + ext_qlinearconv, ext_svm_classifier, ext_svm_regressor, ext_tfidfvectorizer,