Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Quant][Inductor] Enable quantization dynamic batch size support #108550

Conversation

leslie-fang-intel
Copy link
Collaborator

@leslie-fang-intel leslie-fang-intel commented Sep 5, 2023

Stack from ghstack (oldest at bottom):

Summary
This diff enables dynamic batch size support for the quantization use case in Inductor. Taking the UT in this PR as an example, after this PR the generated code will assume a dynamic input batch size.

cpp_fused_quantize_per_tensor_0 = async_compile.cpp('''
#include "/tmp/torchinductor_root/ib/cibrnuq56cxamjj4krp4zpjvsirbmlolpbnmomodzyd46huzhdw7.h"
extern "C" void kernel(const float* in_ptr0,
                       unsigned char* out_ptr0,
                       const long ks0,
                       const long ks1)
{
    {
        #pragma GCC ivdep
        for(long i0=static_cast<long>(0L); i0<static_cast<long>(ks0); i0+=static_cast<long>(1L))
        {
            #pragma GCC ivdep
            for(long i1=static_cast<long>(0L); i1<static_cast<long>(3L); i1+=static_cast<long>(1L))
            {
                #pragma GCC ivdep
                for(long i2=static_cast<long>(0L); i2<static_cast<long>(static_cast<long>(ks1*ks1)); i2+=static_cast<long>(1L))
                {
                    auto tmp0 = in_ptr0[static_cast<long>(i2 + (i1*(static_cast<long>(ks1*ks1))) + (3L*i0*(static_cast<long>(ks1*ks1))))];
                    auto tmp1 = static_cast<float>(40.36037717834931);
                    auto tmp2 = decltype(tmp0)(tmp0 * tmp1);
                    auto tmp3 = std::nearbyint(tmp2);
                    auto tmp4 = static_cast<float>(97.0);
                    auto tmp5 = tmp3 + tmp4;
                    auto tmp6 = static_cast<float>(0.0);
                    auto tmp7 = max_propagate_nan(tmp5, tmp6);
                    auto tmp8 = static_cast<float>(255.0);
                    auto tmp9 = min_propagate_nan(tmp7, tmp8);
                    auto tmp10 = static_cast<unsigned char>(tmp9);
                    out_ptr0[static_cast<long>(i1 + (3L*i2) + (3L*i0*(static_cast<long>(ks1*ks1))))] = tmp10;
                }
            }
        }
    }
}
''')


cpp_fused_dequantize_per_tensor_mean_quantize_per_tensor_1 = async_compile.cpp('''
#include "/tmp/torchinductor_root/ib/cibrnuq56cxamjj4krp4zpjvsirbmlolpbnmomodzyd46huzhdw7.h"
extern "C" void kernel(const unsigned char* in_ptr0,
                       float* out_ptr0,
                       unsigned char* out_ptr1,
                       const long ks0,
                       const long ks1)
{
    {
        #pragma GCC ivdep
        for(long i0=static_cast<long>(0L); i0<static_cast<long>(ks0); i0+=static_cast<long>(1L))
        {
            for(long i1=static_cast<long>(0L); i1<static_cast<long>(16L); i1+=static_cast<long>(16L))
            {
                {
                    #pragma omp declare reduction(+:at::vec::Vectorized<float>:omp_out = omp_out + omp_in) initializer(omp_priv={at::vec::Vectorized<float>(0)})
                    float tmp_acc0 = 0;
                    at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0);
                    for(long i2=static_cast<long>(0L); i2<static_cast<long>(1L + (static_cast<long>((at::native::div_floor_integer(ks1, 2L))*(at::native::div_floor_integer(ks1, 2L)))) + (2L*(at::native::div_floor_integer(ks1, 2L)))); i2+=static_cast<long>(1L))
                    {
                        auto tmp0 = at::vec::Vectorized<uint8_t>::loadu_one_fourth(in_ptr0 + static_cast<long>(i1 + (16L*i0) + (16L*i2) + (16L*i0*(static_cast<long>((at::native::div_floor_integer(ks1, 2L))*(at::native::div_floor_integer(ks1, 2L))))) + (32L*i0*(at::native::div_floor_integer(ks1, 2L)))));
                        auto tmp1 = at::vec::convert_uint8_to_float(tmp0);
                        auto tmp2 = at::vec::Vectorized<float>(static_cast<float>(0.0));
                        auto tmp3 = tmp1 - tmp2;
                        auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(0.010429476387798786));
                        auto tmp5 = tmp3 * tmp4;
                        tmp_acc0_vec = tmp_acc0_vec + tmp5;
                    }
                    tmp_acc0_vec.store(out_ptr0 + static_cast<long>(i1 + (16L*i0)));
                }
            }
        }
    }
    {
        #pragma GCC ivdep
        for(long i0=static_cast<long>(0L); i0<static_cast<long>(16L*ks0); i0+=static_cast<long>(1L))
        {
            auto tmp0 = out_ptr0[static_cast<long>(i0)];
            auto tmp1 = static_cast<float>(1L + (static_cast<long>((at::native::div_floor_integer(ks1, 2L))*(at::native::div_floor_integer(ks1, 2L)))) + (2L*(at::native::div_floor_integer(ks1, 2L))));
            auto tmp2 = tmp0 / tmp1;
            auto tmp3 = static_cast<float>(168.09128392896545);
            auto tmp4 = decltype(tmp2)(tmp2 * tmp3);
            auto tmp5 = std::nearbyint(tmp4);
            auto tmp6 = static_cast<float>(0.0);
            auto tmp7 = tmp5 + tmp6;
            auto tmp8 = max_propagate_nan(tmp7, tmp6);
            auto tmp9 = static_cast<float>(255.0);
            auto tmp10 = min_propagate_nan(tmp8, tmp9);
            auto tmp11 = static_cast<unsigned char>(tmp10);
            out_ptr1[static_cast<long>(i0)] = tmp11;
        }
    }
}
''')


cpp_fused_dequantize_per_tensor_2 = async_compile.cpp('''
#include "/tmp/torchinductor_root/ib/cibrnuq56cxamjj4krp4zpjvsirbmlolpbnmomodzyd46huzhdw7.h"
extern "C" void kernel(const unsigned char* in_ptr0,
                       float* out_ptr0,
                       const long ks0)
{
    {
        for(long i0=static_cast<long>(0L); i0<static_cast<long>(16L*ks0); i0+=static_cast<long>(16L))
        {
            auto tmp0 = at::vec::Vectorized<uint8_t>::loadu_one_fourth(in_ptr0 + static_cast<long>(i0));
            auto tmp1 = at::vec::convert_uint8_to_float(tmp0);
            auto tmp2 = at::vec::Vectorized<float>(static_cast<float>(100.0));
            auto tmp3 = tmp1 - tmp2;
            auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(0.0056716203689575195));
            auto tmp5 = tmp3 * tmp4;
            tmp5.store(out_ptr0 + static_cast<long>(i0));
        }
    }
}
''')


async_compile.wait(globals())
del async_compile

def call(args):
    arg8_1, arg9_1, arg10_1 = args
    args.clear()
    s0 = arg8_1
    s2 = arg9_1
    assert_size_stride(arg10_1, (s0, 3, s2, s2), (3*(s2*s2), s2*s2, s2, 1))
    buf0 = empty_strided((s0, 3, s2, s2), (3*(s2*s2), 1, 3*s2, 3), device='cpu', dtype=torch.uint8)
    cpp_fused_quantize_per_tensor_0(c_void_p(arg10_1.data_ptr()), c_void_p(buf0.data_ptr()), c_long(s0), c_long(s2))
    del arg10_1
    buf1 = torch.ops.onednn.qconv2d_pointwise(buf0, 0.024776775389909744, 97, constant5, constant2, constant3, constant0, [1, 1], [1, 1], [1, 1], 1, 95.88209060714476, 0, False, 'relu', [], '')
    assert_size_stride(buf1, (s0, 16, 1 + s2, 1 + s2), (16 + (16*(s2*s2)) + (32*s2), 1, 16 + (16*s2), 16))
    del buf0
    # Source Nodes: [quantize_per_tensor_default_2], Original ATen: [quantized_decomposed.quantize_per_tensor]
    buf2 = torch.ops.quantized.max_pool2d(buf1, [3, 3], [2, 2], [1, 1], [1, 1], False)
    del buf1
    buf3 = buf2
    assert_size_stride(buf3, (s0, 16, 1 + (s2 // 2), 1 + (s2 // 2)), (16 + (16*((s2 // 2)*(s2 // 2))) + (32*(s2 // 2)), 1, 16 + (16*(s2 // 2)), 16))
    del buf2
    buf4 = empty_strided((s0, 16, 1, 1), (16, 1, 16*s0, 16*s0), device='cpu', dtype=torch.float32)
    buf5 = empty_strided((s0, 16), (16, 1), device='cpu', dtype=torch.uint8)
    cpp_fused_dequantize_per_tensor_mean_quantize_per_tensor_1(c_void_p(buf3.data_ptr()), c_void_p(buf4.data_ptr()), c_void_p(buf5.data_ptr()), c_long(s0), c_long(s2))
    del buf3
    buf6 = torch.ops.onednn.qlinear_pointwise(buf5, 0.005949148442596197, 0, constant6, constant4, constant3, constant1, 176.31645543014483, 100, False, 'none', [], '')
    assert_size_stride(buf6, (s0, 16), (16, 1))
    del buf5
    buf7 = reinterpret_tensor(buf4, (s0, 16), (16, 1)); del buf4  # reuse
    cpp_fused_dequantize_per_tensor_2(c_void_p(buf6.data_ptr()), c_void_p(buf7.data_ptr()), c_long(s0))
    return (buf7, )

TestPlan

python -m pytest test_mkldnn_pattern_matcher.py -k test_qconv2d_maxpool2d_linear_dynamic

cc @voznesenskym @penguinwu @EikanWang @jgong5 @Guobing-Chen @XiaobingSuper @zhuhaozhe @blzheng @Xia-Weiwen @wenzhe-nrv @jiayisunx @peterbell10 @ipiszy @ngimel @yf225 @chenyang78 @kadeng @muchulee8 @aakhundov

@pytorch-bot
Copy link

pytorch-bot bot commented Sep 5, 2023

🔗 Helpful Links

🧪 See artifacts and rendered test results at hud.pytorch.org/pr/108550

Note: Links to docs will display an error until the docs builds have been completed.

✅ No Failures

As of commit 139d783 with merge base dbddf18 (image):
💚 Looks good so far! There are no failures yet. 💚

This comment was automatically generated by Dr. CI and updates every 15 minutes.

leslie-fang-intel added a commit that referenced this pull request Sep 5, 2023
ghstack-source-id: 9f438beef84f598a7a87a65154962ebcaa9b141c
Pull Request resolved: #108550
@leslie-fang-intel leslie-fang-intel marked this pull request as draft September 5, 2023 02:45
@leslie-fang-intel leslie-fang-intel changed the title Enable Quantization dynamic shape support [Draft] [Quant][Inductor] Enable Quantization dynamic shape support Sep 5, 2023
@leslie-fang-intel leslie-fang-intel added the ciflow/trunk Trigger trunk jobs on your pull request label Sep 5, 2023
leslie-fang-intel added a commit that referenced this pull request Sep 5, 2023
ghstack-source-id: 3837eee56aaee5f23c176903d47076fd1b3b6d8b
Pull Request resolved: #108550
…e support"

cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng Xia-Weiwen wenzhe-nrv jiayisunx peterbell10 ipiszy ngimel yf225 chenyang78 kadeng muchulee8 aakhundov

[ghstack-poisoned]
leslie-fang-intel added a commit that referenced this pull request Sep 6, 2023
ghstack-source-id: 1076b0725fe5611249e828a6ffb0aa7e6be2452f
Pull Request resolved: #108550
@leslie-fang-intel leslie-fang-intel changed the title [Draft] [Quant][Inductor] Enable Quantization dynamic shape support [Quant][Inductor] Enable Quantization dynamic shape support Sep 6, 2023
…e support"

cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng Xia-Weiwen wenzhe-nrv jiayisunx peterbell10 ipiszy ngimel yf225 chenyang78 kadeng muchulee8 aakhundov

[ghstack-poisoned]
leslie-fang-intel added a commit that referenced this pull request Sep 6, 2023
ghstack-source-id: 38bf1650725d4988ca7488690c95d11a66d5e106
Pull Request resolved: #108550
**Summary**
This diff enables dynamic batch size support for the quantization use case in Inductor. Taking the UT in this PR as an example, after this PR the generated code will assume a dynamic input batch size.
```
cpp_fused_quantize_per_tensor_0 = async_compile.cpp('''
#include "/tmp/torchinductor_root/ib/cibrnuq56cxamjj4krp4zpjvsirbmlolpbnmomodzyd46huzhdw7.h"
extern "C" void kernel(const float* in_ptr0,
                       unsigned char* out_ptr0,
                       const long ks0,
                       const long ks1)
{
    {
        #pragma GCC ivdep
        for(long i0=static_cast<long>(0L); i0<static_cast<long>(ks0); i0+=static_cast<long>(1L))
        {
            #pragma GCC ivdep
            for(long i1=static_cast<long>(0L); i1<static_cast<long>(3L); i1+=static_cast<long>(1L))
            {
                #pragma GCC ivdep
                for(long i2=static_cast<long>(0L); i2<static_cast<long>(static_cast<long>(ks1*ks1)); i2+=static_cast<long>(1L))
                {
                    auto tmp0 = in_ptr0[static_cast<long>(i2 + (i1*(static_cast<long>(ks1*ks1))) + (3L*i0*(static_cast<long>(ks1*ks1))))];
                    auto tmp1 = static_cast<float>(40.36037717834931);
                    auto tmp2 = decltype(tmp0)(tmp0 * tmp1);
                    auto tmp3 = std::nearbyint(tmp2);
                    auto tmp4 = static_cast<float>(97.0);
                    auto tmp5 = tmp3 + tmp4;
                    auto tmp6 = static_cast<float>(0.0);
                    auto tmp7 = max_propagate_nan(tmp5, tmp6);
                    auto tmp8 = static_cast<float>(255.0);
                    auto tmp9 = min_propagate_nan(tmp7, tmp8);
                    auto tmp10 = static_cast<unsigned char>(tmp9);
                    out_ptr0[static_cast<long>(i1 + (3L*i2) + (3L*i0*(static_cast<long>(ks1*ks1))))] = tmp10;
                }
            }
        }
    }
}
''')


cpp_fused_dequantize_per_tensor_mean_quantize_per_tensor_1 = async_compile.cpp('''
#include "/tmp/torchinductor_root/ib/cibrnuq56cxamjj4krp4zpjvsirbmlolpbnmomodzyd46huzhdw7.h"
extern "C" void kernel(const unsigned char* in_ptr0,
                       float* out_ptr0,
                       unsigned char* out_ptr1,
                       const long ks0,
                       const long ks1)
{
    {
        #pragma GCC ivdep
        for(long i0=static_cast<long>(0L); i0<static_cast<long>(ks0); i0+=static_cast<long>(1L))
        {
            for(long i1=static_cast<long>(0L); i1<static_cast<long>(16L); i1+=static_cast<long>(16L))
            {
                {
                    #pragma omp declare reduction(+:at::vec::Vectorized<float>:omp_out = omp_out + omp_in) initializer(omp_priv={at::vec::Vectorized<float>(0)})
                    float tmp_acc0 = 0;
                    at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0);
                    for(long i2=static_cast<long>(0L); i2<static_cast<long>(1L + (static_cast<long>((at::native::div_floor_integer(ks1, 2L))*(at::native::div_floor_integer(ks1, 2L)))) + (2L*(at::native::div_floor_integer(ks1, 2L)))); i2+=static_cast<long>(1L))
                    {
                        auto tmp0 = at::vec::Vectorized<uint8_t>::loadu_one_fourth(in_ptr0 + static_cast<long>(i1 + (16L*i0) + (16L*i2) + (16L*i0*(static_cast<long>((at::native::div_floor_integer(ks1, 2L))*(at::native::div_floor_integer(ks1, 2L))))) + (32L*i0*(at::native::div_floor_integer(ks1, 2L)))));
                        auto tmp1 = at::vec::convert_uint8_to_float(tmp0);
                        auto tmp2 = at::vec::Vectorized<float>(static_cast<float>(0.0));
                        auto tmp3 = tmp1 - tmp2;
                        auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(0.010429476387798786));
                        auto tmp5 = tmp3 * tmp4;
                        tmp_acc0_vec = tmp_acc0_vec + tmp5;
                    }
                    tmp_acc0_vec.store(out_ptr0 + static_cast<long>(i1 + (16L*i0)));
                }
            }
        }
    }
    {
        #pragma GCC ivdep
        for(long i0=static_cast<long>(0L); i0<static_cast<long>(16L*ks0); i0+=static_cast<long>(1L))
        {
            auto tmp0 = out_ptr0[static_cast<long>(i0)];
            auto tmp1 = static_cast<float>(1L + (static_cast<long>((at::native::div_floor_integer(ks1, 2L))*(at::native::div_floor_integer(ks1, 2L)))) + (2L*(at::native::div_floor_integer(ks1, 2L))));
            auto tmp2 = tmp0 / tmp1;
            auto tmp3 = static_cast<float>(168.09128392896545);
            auto tmp4 = decltype(tmp2)(tmp2 * tmp3);
            auto tmp5 = std::nearbyint(tmp4);
            auto tmp6 = static_cast<float>(0.0);
            auto tmp7 = tmp5 + tmp6;
            auto tmp8 = max_propagate_nan(tmp7, tmp6);
            auto tmp9 = static_cast<float>(255.0);
            auto tmp10 = min_propagate_nan(tmp8, tmp9);
            auto tmp11 = static_cast<unsigned char>(tmp10);
            out_ptr1[static_cast<long>(i0)] = tmp11;
        }
    }
}
''')


cpp_fused_dequantize_per_tensor_2 = async_compile.cpp('''
#include "/tmp/torchinductor_root/ib/cibrnuq56cxamjj4krp4zpjvsirbmlolpbnmomodzyd46huzhdw7.h"
extern "C" void kernel(const unsigned char* in_ptr0,
                       float* out_ptr0,
                       const long ks0)
{
    {
        for(long i0=static_cast<long>(0L); i0<static_cast<long>(16L*ks0); i0+=static_cast<long>(16L))
        {
            auto tmp0 = at::vec::Vectorized<uint8_t>::loadu_one_fourth(in_ptr0 + static_cast<long>(i0));
            auto tmp1 = at::vec::convert_uint8_to_float(tmp0);
            auto tmp2 = at::vec::Vectorized<float>(static_cast<float>(100.0));
            auto tmp3 = tmp1 - tmp2;
            auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(0.0056716203689575195));
            auto tmp5 = tmp3 * tmp4;
            tmp5.store(out_ptr0 + static_cast<long>(i0));
        }
    }
}
''')


async_compile.wait(globals())
del async_compile

def call(args):
    arg8_1, arg9_1, arg10_1 = args
    args.clear()
    s0 = arg8_1
    s2 = arg9_1
    assert_size_stride(arg10_1, (s0, 3, s2, s2), (3*(s2*s2), s2*s2, s2, 1))
    buf0 = empty_strided((s0, 3, s2, s2), (3*(s2*s2), 1, 3*s2, 3), device='cpu', dtype=torch.uint8)
    cpp_fused_quantize_per_tensor_0(c_void_p(arg10_1.data_ptr()), c_void_p(buf0.data_ptr()), c_long(s0), c_long(s2))
    del arg10_1
    buf1 = torch.ops.onednn.qconv2d_pointwise(buf0, 0.024776775389909744, 97, constant5, constant2, constant3, constant0, [1, 1], [1, 1], [1, 1], 1, 95.88209060714476, 0, False, 'relu', [], '')
    assert_size_stride(buf1, (s0, 16, 1 + s2, 1 + s2), (16 + (16*(s2*s2)) + (32*s2), 1, 16 + (16*s2), 16))
    del buf0
    # Source Nodes: [quantize_per_tensor_default_2], Original ATen: [quantized_decomposed.quantize_per_tensor]
    buf2 = torch.ops.quantized.max_pool2d(buf1, [3, 3], [2, 2], [1, 1], [1, 1], False)
    del buf1
    buf3 = buf2
    assert_size_stride(buf3, (s0, 16, 1 + (s2 // 2), 1 + (s2 // 2)), (16 + (16*((s2 // 2)*(s2 // 2))) + (32*(s2 // 2)), 1, 16 + (16*(s2 // 2)), 16))
    del buf2
    buf4 = empty_strided((s0, 16, 1, 1), (16, 1, 16*s0, 16*s0), device='cpu', dtype=torch.float32)
    buf5 = empty_strided((s0, 16), (16, 1), device='cpu', dtype=torch.uint8)
    cpp_fused_dequantize_per_tensor_mean_quantize_per_tensor_1(c_void_p(buf3.data_ptr()), c_void_p(buf4.data_ptr()), c_void_p(buf5.data_ptr()), c_long(s0), c_long(s2))
    del buf3
    buf6 = torch.ops.onednn.qlinear_pointwise(buf5, 0.005949148442596197, 0, constant6, constant4, constant3, constant1, 176.31645543014483, 100, False, 'none', [], '')
    assert_size_stride(buf6, (s0, 16), (16, 1))
    del buf5
    buf7 = reinterpret_tensor(buf4, (s0, 16), (16, 1)); del buf4  # reuse
    cpp_fused_dequantize_per_tensor_2(c_void_p(buf6.data_ptr()), c_void_p(buf7.data_ptr()), c_long(s0))
    return (buf7, )

```

**TestPlan**
```
python -m pytest test_mkldnn_pattern_matcher.py -k test_qconv2d_maxpool2d_linear_dynamic
```

cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng Xia-Weiwen wenzhe-nrv jiayisunx peterbell10 ipiszy ngimel yf225 chenyang78 kadeng muchulee8 aakhundov

[ghstack-poisoned]
leslie-fang-intel added a commit that referenced this pull request Sep 7, 2023
ghstack-source-id: 5ef3c27e1f0e63fe24b710bb0bc273667203fda1
Pull Request resolved: #108550
**Summary**
This diff enables dynamic batch size support for the quantization use case in Inductor. Taking the UT in this PR as an example, after this PR the generated code will assume a dynamic input batch size.
```
cpp_fused_quantize_per_tensor_0 = async_compile.cpp('''
#include "/tmp/torchinductor_root/ib/cibrnuq56cxamjj4krp4zpjvsirbmlolpbnmomodzyd46huzhdw7.h"
extern "C" void kernel(const float* in_ptr0,
                       unsigned char* out_ptr0,
                       const long ks0,
                       const long ks1)
{
    {
        #pragma GCC ivdep
        for(long i0=static_cast<long>(0L); i0<static_cast<long>(ks0); i0+=static_cast<long>(1L))
        {
            #pragma GCC ivdep
            for(long i1=static_cast<long>(0L); i1<static_cast<long>(3L); i1+=static_cast<long>(1L))
            {
                #pragma GCC ivdep
                for(long i2=static_cast<long>(0L); i2<static_cast<long>(static_cast<long>(ks1*ks1)); i2+=static_cast<long>(1L))
                {
                    auto tmp0 = in_ptr0[static_cast<long>(i2 + (i1*(static_cast<long>(ks1*ks1))) + (3L*i0*(static_cast<long>(ks1*ks1))))];
                    auto tmp1 = static_cast<float>(40.36037717834931);
                    auto tmp2 = decltype(tmp0)(tmp0 * tmp1);
                    auto tmp3 = std::nearbyint(tmp2);
                    auto tmp4 = static_cast<float>(97.0);
                    auto tmp5 = tmp3 + tmp4;
                    auto tmp6 = static_cast<float>(0.0);
                    auto tmp7 = max_propagate_nan(tmp5, tmp6);
                    auto tmp8 = static_cast<float>(255.0);
                    auto tmp9 = min_propagate_nan(tmp7, tmp8);
                    auto tmp10 = static_cast<unsigned char>(tmp9);
                    out_ptr0[static_cast<long>(i1 + (3L*i2) + (3L*i0*(static_cast<long>(ks1*ks1))))] = tmp10;
                }
            }
        }
    }
}
''')


cpp_fused_dequantize_per_tensor_mean_quantize_per_tensor_1 = async_compile.cpp('''
#include "/tmp/torchinductor_root/ib/cibrnuq56cxamjj4krp4zpjvsirbmlolpbnmomodzyd46huzhdw7.h"
extern "C" void kernel(const unsigned char* in_ptr0,
                       float* out_ptr0,
                       unsigned char* out_ptr1,
                       const long ks0,
                       const long ks1)
{
    {
        #pragma GCC ivdep
        for(long i0=static_cast<long>(0L); i0<static_cast<long>(ks0); i0+=static_cast<long>(1L))
        {
            for(long i1=static_cast<long>(0L); i1<static_cast<long>(16L); i1+=static_cast<long>(16L))
            {
                {
                    #pragma omp declare reduction(+:at::vec::Vectorized<float>:omp_out = omp_out + omp_in) initializer(omp_priv={at::vec::Vectorized<float>(0)})
                    float tmp_acc0 = 0;
                    at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0);
                    for(long i2=static_cast<long>(0L); i2<static_cast<long>(1L + (static_cast<long>((at::native::div_floor_integer(ks1, 2L))*(at::native::div_floor_integer(ks1, 2L)))) + (2L*(at::native::div_floor_integer(ks1, 2L)))); i2+=static_cast<long>(1L))
                    {
                        auto tmp0 = at::vec::Vectorized<uint8_t>::loadu_one_fourth(in_ptr0 + static_cast<long>(i1 + (16L*i0) + (16L*i2) + (16L*i0*(static_cast<long>((at::native::div_floor_integer(ks1, 2L))*(at::native::div_floor_integer(ks1, 2L))))) + (32L*i0*(at::native::div_floor_integer(ks1, 2L)))));
                        auto tmp1 = at::vec::convert_uint8_to_float(tmp0);
                        auto tmp2 = at::vec::Vectorized<float>(static_cast<float>(0.0));
                        auto tmp3 = tmp1 - tmp2;
                        auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(0.010429476387798786));
                        auto tmp5 = tmp3 * tmp4;
                        tmp_acc0_vec = tmp_acc0_vec + tmp5;
                    }
                    tmp_acc0_vec.store(out_ptr0 + static_cast<long>(i1 + (16L*i0)));
                }
            }
        }
    }
    {
        #pragma GCC ivdep
        for(long i0=static_cast<long>(0L); i0<static_cast<long>(16L*ks0); i0+=static_cast<long>(1L))
        {
            auto tmp0 = out_ptr0[static_cast<long>(i0)];
            auto tmp1 = static_cast<float>(1L + (static_cast<long>((at::native::div_floor_integer(ks1, 2L))*(at::native::div_floor_integer(ks1, 2L)))) + (2L*(at::native::div_floor_integer(ks1, 2L))));
            auto tmp2 = tmp0 / tmp1;
            auto tmp3 = static_cast<float>(168.09128392896545);
            auto tmp4 = decltype(tmp2)(tmp2 * tmp3);
            auto tmp5 = std::nearbyint(tmp4);
            auto tmp6 = static_cast<float>(0.0);
            auto tmp7 = tmp5 + tmp6;
            auto tmp8 = max_propagate_nan(tmp7, tmp6);
            auto tmp9 = static_cast<float>(255.0);
            auto tmp10 = min_propagate_nan(tmp8, tmp9);
            auto tmp11 = static_cast<unsigned char>(tmp10);
            out_ptr1[static_cast<long>(i0)] = tmp11;
        }
    }
}
''')


cpp_fused_dequantize_per_tensor_2 = async_compile.cpp('''
#include "/tmp/torchinductor_root/ib/cibrnuq56cxamjj4krp4zpjvsirbmlolpbnmomodzyd46huzhdw7.h"
extern "C" void kernel(const unsigned char* in_ptr0,
                       float* out_ptr0,
                       const long ks0)
{
    {
        for(long i0=static_cast<long>(0L); i0<static_cast<long>(16L*ks0); i0+=static_cast<long>(16L))
        {
            auto tmp0 = at::vec::Vectorized<uint8_t>::loadu_one_fourth(in_ptr0 + static_cast<long>(i0));
            auto tmp1 = at::vec::convert_uint8_to_float(tmp0);
            auto tmp2 = at::vec::Vectorized<float>(static_cast<float>(100.0));
            auto tmp3 = tmp1 - tmp2;
            auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(0.0056716203689575195));
            auto tmp5 = tmp3 * tmp4;
            tmp5.store(out_ptr0 + static_cast<long>(i0));
        }
    }
}
''')


async_compile.wait(globals())
del async_compile

def call(args):
    arg8_1, arg9_1, arg10_1 = args
    args.clear()
    s0 = arg8_1
    s2 = arg9_1
    assert_size_stride(arg10_1, (s0, 3, s2, s2), (3*(s2*s2), s2*s2, s2, 1))
    buf0 = empty_strided((s0, 3, s2, s2), (3*(s2*s2), 1, 3*s2, 3), device='cpu', dtype=torch.uint8)
    cpp_fused_quantize_per_tensor_0(c_void_p(arg10_1.data_ptr()), c_void_p(buf0.data_ptr()), c_long(s0), c_long(s2))
    del arg10_1
    buf1 = torch.ops.onednn.qconv2d_pointwise(buf0, 0.024776775389909744, 97, constant5, constant2, constant3, constant0, [1, 1], [1, 1], [1, 1], 1, 95.88209060714476, 0, False, 'relu', [], '')
    assert_size_stride(buf1, (s0, 16, 1 + s2, 1 + s2), (16 + (16*(s2*s2)) + (32*s2), 1, 16 + (16*s2), 16))
    del buf0
    # Source Nodes: [quantize_per_tensor_default_2], Original ATen: [quantized_decomposed.quantize_per_tensor]
    buf2 = torch.ops.quantized.max_pool2d(buf1, [3, 3], [2, 2], [1, 1], [1, 1], False)
    del buf1
    buf3 = buf2
    assert_size_stride(buf3, (s0, 16, 1 + (s2 // 2), 1 + (s2 // 2)), (16 + (16*((s2 // 2)*(s2 // 2))) + (32*(s2 // 2)), 1, 16 + (16*(s2 // 2)), 16))
    del buf2
    buf4 = empty_strided((s0, 16, 1, 1), (16, 1, 16*s0, 16*s0), device='cpu', dtype=torch.float32)
    buf5 = empty_strided((s0, 16), (16, 1), device='cpu', dtype=torch.uint8)
    cpp_fused_dequantize_per_tensor_mean_quantize_per_tensor_1(c_void_p(buf3.data_ptr()), c_void_p(buf4.data_ptr()), c_void_p(buf5.data_ptr()), c_long(s0), c_long(s2))
    del buf3
    buf6 = torch.ops.onednn.qlinear_pointwise(buf5, 0.005949148442596197, 0, constant6, constant4, constant3, constant1, 176.31645543014483, 100, False, 'none', [], '')
    assert_size_stride(buf6, (s0, 16), (16, 1))
    del buf5
    buf7 = reinterpret_tensor(buf4, (s0, 16), (16, 1)); del buf4  # reuse
    cpp_fused_dequantize_per_tensor_2(c_void_p(buf6.data_ptr()), c_void_p(buf7.data_ptr()), c_long(s0))
    return (buf7, )

```

**TestPlan**
```
python -m pytest test_mkldnn_pattern_matcher.py -k test_qconv2d_maxpool2d_linear_dynamic
```

cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng Xia-Weiwen wenzhe-nrv jiayisunx peterbell10 ipiszy ngimel yf225 chenyang78 kadeng muchulee8 aakhundov

[ghstack-poisoned]
@leslie-fang-intel leslie-fang-intel changed the title [Quant][Inductor] Enable Quantization dynamic shape support [Quant][Inductor] Enable Quantization dynamic batch size support Sep 7, 2023
@leslie-fang-intel leslie-fang-intel changed the title [Quant][Inductor] Enable Quantization dynamic batch size support [Quant][Inductor] Enable quantization dynamic batch size support Sep 7, 2023
leslie-fang-intel added a commit that referenced this pull request Sep 7, 2023
ghstack-source-id: d94187ebf3c46a356390a67bb723e62404dc6d5b
Pull Request resolved: #108550
…upport"


**Summary**
This diff enables dynamic batch size support for the quantization use case in Inductor. Taking the UT in this PR as an example, after this PR the generated code will assume a dynamic input batch size.
```
cpp_fused_quantize_per_tensor_0 = async_compile.cpp('''
#include "/tmp/torchinductor_root/ib/cibrnuq56cxamjj4krp4zpjvsirbmlolpbnmomodzyd46huzhdw7.h"
extern "C" void kernel(const float* in_ptr0,
                       unsigned char* out_ptr0,
                       const long ks0,
                       const long ks1)
{
    {
        #pragma GCC ivdep
        for(long i0=static_cast<long>(0L); i0<static_cast<long>(ks0); i0+=static_cast<long>(1L))
        {
            #pragma GCC ivdep
            for(long i1=static_cast<long>(0L); i1<static_cast<long>(3L); i1+=static_cast<long>(1L))
            {
                #pragma GCC ivdep
                for(long i2=static_cast<long>(0L); i2<static_cast<long>(static_cast<long>(ks1*ks1)); i2+=static_cast<long>(1L))
                {
                    auto tmp0 = in_ptr0[static_cast<long>(i2 + (i1*(static_cast<long>(ks1*ks1))) + (3L*i0*(static_cast<long>(ks1*ks1))))];
                    auto tmp1 = static_cast<float>(40.36037717834931);
                    auto tmp2 = decltype(tmp0)(tmp0 * tmp1);
                    auto tmp3 = std::nearbyint(tmp2);
                    auto tmp4 = static_cast<float>(97.0);
                    auto tmp5 = tmp3 + tmp4;
                    auto tmp6 = static_cast<float>(0.0);
                    auto tmp7 = max_propagate_nan(tmp5, tmp6);
                    auto tmp8 = static_cast<float>(255.0);
                    auto tmp9 = min_propagate_nan(tmp7, tmp8);
                    auto tmp10 = static_cast<unsigned char>(tmp9);
                    out_ptr0[static_cast<long>(i1 + (3L*i2) + (3L*i0*(static_cast<long>(ks1*ks1))))] = tmp10;
                }
            }
        }
    }
}
''')


cpp_fused_dequantize_per_tensor_mean_quantize_per_tensor_1 = async_compile.cpp('''
#include "/tmp/torchinductor_root/ib/cibrnuq56cxamjj4krp4zpjvsirbmlolpbnmomodzyd46huzhdw7.h"
extern "C" void kernel(const unsigned char* in_ptr0,
                       float* out_ptr0,
                       unsigned char* out_ptr1,
                       const long ks0,
                       const long ks1)
{
    {
        #pragma GCC ivdep
        for(long i0=static_cast<long>(0L); i0<static_cast<long>(ks0); i0+=static_cast<long>(1L))
        {
            for(long i1=static_cast<long>(0L); i1<static_cast<long>(16L); i1+=static_cast<long>(16L))
            {
                {
                    #pragma omp declare reduction(+:at::vec::Vectorized<float>:omp_out = omp_out + omp_in) initializer(omp_priv={at::vec::Vectorized<float>(0)})
                    float tmp_acc0 = 0;
                    at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0);
                    for(long i2=static_cast<long>(0L); i2<static_cast<long>(1L + (static_cast<long>((at::native::div_floor_integer(ks1, 2L))*(at::native::div_floor_integer(ks1, 2L)))) + (2L*(at::native::div_floor_integer(ks1, 2L)))); i2+=static_cast<long>(1L))
                    {
                        auto tmp0 = at::vec::Vectorized<uint8_t>::loadu_one_fourth(in_ptr0 + static_cast<long>(i1 + (16L*i0) + (16L*i2) + (16L*i0*(static_cast<long>((at::native::div_floor_integer(ks1, 2L))*(at::native::div_floor_integer(ks1, 2L))))) + (32L*i0*(at::native::div_floor_integer(ks1, 2L)))));
                        auto tmp1 = at::vec::convert_uint8_to_float(tmp0);
                        auto tmp2 = at::vec::Vectorized<float>(static_cast<float>(0.0));
                        auto tmp3 = tmp1 - tmp2;
                        auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(0.010429476387798786));
                        auto tmp5 = tmp3 * tmp4;
                        tmp_acc0_vec = tmp_acc0_vec + tmp5;
                    }
                    tmp_acc0_vec.store(out_ptr0 + static_cast<long>(i1 + (16L*i0)));
                }
            }
        }
    }
    {
        #pragma GCC ivdep
        for(long i0=static_cast<long>(0L); i0<static_cast<long>(16L*ks0); i0+=static_cast<long>(1L))
        {
            auto tmp0 = out_ptr0[static_cast<long>(i0)];
            auto tmp1 = static_cast<float>(1L + (static_cast<long>((at::native::div_floor_integer(ks1, 2L))*(at::native::div_floor_integer(ks1, 2L)))) + (2L*(at::native::div_floor_integer(ks1, 2L))));
            auto tmp2 = tmp0 / tmp1;
            auto tmp3 = static_cast<float>(168.09128392896545);
            auto tmp4 = decltype(tmp2)(tmp2 * tmp3);
            auto tmp5 = std::nearbyint(tmp4);
            auto tmp6 = static_cast<float>(0.0);
            auto tmp7 = tmp5 + tmp6;
            auto tmp8 = max_propagate_nan(tmp7, tmp6);
            auto tmp9 = static_cast<float>(255.0);
            auto tmp10 = min_propagate_nan(tmp8, tmp9);
            auto tmp11 = static_cast<unsigned char>(tmp10);
            out_ptr1[static_cast<long>(i0)] = tmp11;
        }
    }
}
''')


cpp_fused_dequantize_per_tensor_2 = async_compile.cpp('''
#include "/tmp/torchinductor_root/ib/cibrnuq56cxamjj4krp4zpjvsirbmlolpbnmomodzyd46huzhdw7.h"
extern "C" void kernel(const unsigned char* in_ptr0,
                       float* out_ptr0,
                       const long ks0)
{
    {
        for(long i0=static_cast<long>(0L); i0<static_cast<long>(16L*ks0); i0+=static_cast<long>(16L))
        {
            auto tmp0 = at::vec::Vectorized<uint8_t>::loadu_one_fourth(in_ptr0 + static_cast<long>(i0));
            auto tmp1 = at::vec::convert_uint8_to_float(tmp0);
            auto tmp2 = at::vec::Vectorized<float>(static_cast<float>(100.0));
            auto tmp3 = tmp1 - tmp2;
            auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(0.0056716203689575195));
            auto tmp5 = tmp3 * tmp4;
            tmp5.store(out_ptr0 + static_cast<long>(i0));
        }
    }
}
''')


async_compile.wait(globals())
del async_compile

def call(args):
    arg8_1, arg9_1, arg10_1 = args
    args.clear()
    s0 = arg8_1
    s2 = arg9_1
    assert_size_stride(arg10_1, (s0, 3, s2, s2), (3*(s2*s2), s2*s2, s2, 1))
    buf0 = empty_strided((s0, 3, s2, s2), (3*(s2*s2), 1, 3*s2, 3), device='cpu', dtype=torch.uint8)
    cpp_fused_quantize_per_tensor_0(c_void_p(arg10_1.data_ptr()), c_void_p(buf0.data_ptr()), c_long(s0), c_long(s2))
    del arg10_1
    buf1 = torch.ops.onednn.qconv2d_pointwise(buf0, 0.024776775389909744, 97, constant5, constant2, constant3, constant0, [1, 1], [1, 1], [1, 1], 1, 95.88209060714476, 0, False, 'relu', [], '')
    assert_size_stride(buf1, (s0, 16, 1 + s2, 1 + s2), (16 + (16*(s2*s2)) + (32*s2), 1, 16 + (16*s2), 16))
    del buf0
    # Source Nodes: [quantize_per_tensor_default_2], Original ATen: [quantized_decomposed.quantize_per_tensor]
    buf2 = torch.ops.quantized.max_pool2d(buf1, [3, 3], [2, 2], [1, 1], [1, 1], False)
    del buf1
    buf3 = buf2
    assert_size_stride(buf3, (s0, 16, 1 + (s2 // 2), 1 + (s2 // 2)), (16 + (16*((s2 // 2)*(s2 // 2))) + (32*(s2 // 2)), 1, 16 + (16*(s2 // 2)), 16))
    del buf2
    buf4 = empty_strided((s0, 16, 1, 1), (16, 1, 16*s0, 16*s0), device='cpu', dtype=torch.float32)
    buf5 = empty_strided((s0, 16), (16, 1), device='cpu', dtype=torch.uint8)
    cpp_fused_dequantize_per_tensor_mean_quantize_per_tensor_1(c_void_p(buf3.data_ptr()), c_void_p(buf4.data_ptr()), c_void_p(buf5.data_ptr()), c_long(s0), c_long(s2))
    del buf3
    buf6 = torch.ops.onednn.qlinear_pointwise(buf5, 0.005949148442596197, 0, constant6, constant4, constant3, constant1, 176.31645543014483, 100, False, 'none', [], '')
    assert_size_stride(buf6, (s0, 16), (16, 1))
    del buf5
    buf7 = reinterpret_tensor(buf4, (s0, 16), (16, 1)); del buf4  # reuse
    cpp_fused_dequantize_per_tensor_2(c_void_p(buf6.data_ptr()), c_void_p(buf7.data_ptr()), c_long(s0))
    return (buf7, )

```

**TestPlan**
```
python -m pytest test_mkldnn_pattern_matcher.py -k test_qconv2d_maxpool2d_linear_dynamic
```

cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng Xia-Weiwen wenzhe-nrv jiayisunx peterbell10 ipiszy ngimel yf225 chenyang78 kadeng muchulee8 aakhundov

[ghstack-poisoned]
leslie-fang-intel added a commit that referenced this pull request Sep 7, 2023
ghstack-source-id: fed07eda3c083a9363edca31cbdd052905646257
Pull Request resolved: #108550
leslie-fang-intel added a commit that referenced this pull request Sep 7, 2023
ghstack-source-id: e2dfdcc20e6e69a6ae9c1b221bea84890d15788e
Pull Request resolved: #108550
…upport"


**Summary**
This diff enables dynamic batch size support for the quantization use case in Inductor. Taking the UT in this PR as an example, after this PR the generated code will assume a dynamic input batch size.
```
cpp_fused_quantize_per_tensor_0 = async_compile.cpp('''
#include "/tmp/torchinductor_root/ib/cibrnuq56cxamjj4krp4zpjvsirbmlolpbnmomodzyd46huzhdw7.h"
extern "C" void kernel(const float* in_ptr0,
                       unsigned char* out_ptr0,
                       const long ks0,
                       const long ks1)
{
    {
        #pragma GCC ivdep
        for(long i0=static_cast<long>(0L); i0<static_cast<long>(ks0); i0+=static_cast<long>(1L))
        {
            #pragma GCC ivdep
            for(long i1=static_cast<long>(0L); i1<static_cast<long>(3L); i1+=static_cast<long>(1L))
            {
                #pragma GCC ivdep
                for(long i2=static_cast<long>(0L); i2<static_cast<long>(static_cast<long>(ks1*ks1)); i2+=static_cast<long>(1L))
                {
                    auto tmp0 = in_ptr0[static_cast<long>(i2 + (i1*(static_cast<long>(ks1*ks1))) + (3L*i0*(static_cast<long>(ks1*ks1))))];
                    auto tmp1 = static_cast<float>(40.36037717834931);
                    auto tmp2 = decltype(tmp0)(tmp0 * tmp1);
                    auto tmp3 = std::nearbyint(tmp2);
                    auto tmp4 = static_cast<float>(97.0);
                    auto tmp5 = tmp3 + tmp4;
                    auto tmp6 = static_cast<float>(0.0);
                    auto tmp7 = max_propagate_nan(tmp5, tmp6);
                    auto tmp8 = static_cast<float>(255.0);
                    auto tmp9 = min_propagate_nan(tmp7, tmp8);
                    auto tmp10 = static_cast<unsigned char>(tmp9);
                    out_ptr0[static_cast<long>(i1 + (3L*i2) + (3L*i0*(static_cast<long>(ks1*ks1))))] = tmp10;
                }
            }
        }
    }
}
''')


cpp_fused_dequantize_per_tensor_mean_quantize_per_tensor_1 = async_compile.cpp('''
#include "/tmp/torchinductor_root/ib/cibrnuq56cxamjj4krp4zpjvsirbmlolpbnmomodzyd46huzhdw7.h"
extern "C" void kernel(const unsigned char* in_ptr0,
                       float* out_ptr0,
                       unsigned char* out_ptr1,
                       const long ks0,
                       const long ks1)
{
    {
        #pragma GCC ivdep
        for(long i0=static_cast<long>(0L); i0<static_cast<long>(ks0); i0+=static_cast<long>(1L))
        {
            for(long i1=static_cast<long>(0L); i1<static_cast<long>(16L); i1+=static_cast<long>(16L))
            {
                {
                    #pragma omp declare reduction(+:at::vec::Vectorized<float>:omp_out = omp_out + omp_in) initializer(omp_priv={at::vec::Vectorized<float>(0)})
                    float tmp_acc0 = 0;
                    at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0);
                    for(long i2=static_cast<long>(0L); i2<static_cast<long>(1L + (static_cast<long>((at::native::div_floor_integer(ks1, 2L))*(at::native::div_floor_integer(ks1, 2L)))) + (2L*(at::native::div_floor_integer(ks1, 2L)))); i2+=static_cast<long>(1L))
                    {
                        auto tmp0 = at::vec::Vectorized<uint8_t>::loadu_one_fourth(in_ptr0 + static_cast<long>(i1 + (16L*i0) + (16L*i2) + (16L*i0*(static_cast<long>((at::native::div_floor_integer(ks1, 2L))*(at::native::div_floor_integer(ks1, 2L))))) + (32L*i0*(at::native::div_floor_integer(ks1, 2L)))));
                        auto tmp1 = at::vec::convert_uint8_to_float(tmp0);
                        auto tmp2 = at::vec::Vectorized<float>(static_cast<float>(0.0));
                        auto tmp3 = tmp1 - tmp2;
                        auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(0.010429476387798786));
                        auto tmp5 = tmp3 * tmp4;
                        tmp_acc0_vec = tmp_acc0_vec + tmp5;
                    }
                    tmp_acc0_vec.store(out_ptr0 + static_cast<long>(i1 + (16L*i0)));
                }
            }
        }
    }
    {
        #pragma GCC ivdep
        for(long i0=static_cast<long>(0L); i0<static_cast<long>(16L*ks0); i0+=static_cast<long>(1L))
        {
            auto tmp0 = out_ptr0[static_cast<long>(i0)];
            auto tmp1 = static_cast<float>(1L + (static_cast<long>((at::native::div_floor_integer(ks1, 2L))*(at::native::div_floor_integer(ks1, 2L)))) + (2L*(at::native::div_floor_integer(ks1, 2L))));
            auto tmp2 = tmp0 / tmp1;
            auto tmp3 = static_cast<float>(168.09128392896545);
            auto tmp4 = decltype(tmp2)(tmp2 * tmp3);
            auto tmp5 = std::nearbyint(tmp4);
            auto tmp6 = static_cast<float>(0.0);
            auto tmp7 = tmp5 + tmp6;
            auto tmp8 = max_propagate_nan(tmp7, tmp6);
            auto tmp9 = static_cast<float>(255.0);
            auto tmp10 = min_propagate_nan(tmp8, tmp9);
            auto tmp11 = static_cast<unsigned char>(tmp10);
            out_ptr1[static_cast<long>(i0)] = tmp11;
        }
    }
}
''')


cpp_fused_dequantize_per_tensor_2 = async_compile.cpp('''
#include "/tmp/torchinductor_root/ib/cibrnuq56cxamjj4krp4zpjvsirbmlolpbnmomodzyd46huzhdw7.h"
extern "C" void kernel(const unsigned char* in_ptr0,
                       float* out_ptr0,
                       const long ks0)
{
    {
        for(long i0=static_cast<long>(0L); i0<static_cast<long>(16L*ks0); i0+=static_cast<long>(16L))
        {
            auto tmp0 = at::vec::Vectorized<uint8_t>::loadu_one_fourth(in_ptr0 + static_cast<long>(i0));
            auto tmp1 = at::vec::convert_uint8_to_float(tmp0);
            auto tmp2 = at::vec::Vectorized<float>(static_cast<float>(100.0));
            auto tmp3 = tmp1 - tmp2;
            auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(0.0056716203689575195));
            auto tmp5 = tmp3 * tmp4;
            tmp5.store(out_ptr0 + static_cast<long>(i0));
        }
    }
}
''')


async_compile.wait(globals())
del async_compile

def call(args):
    arg8_1, arg9_1, arg10_1 = args
    args.clear()
    s0 = arg8_1
    s2 = arg9_1
    assert_size_stride(arg10_1, (s0, 3, s2, s2), (3*(s2*s2), s2*s2, s2, 1))
    buf0 = empty_strided((s0, 3, s2, s2), (3*(s2*s2), 1, 3*s2, 3), device='cpu', dtype=torch.uint8)
    cpp_fused_quantize_per_tensor_0(c_void_p(arg10_1.data_ptr()), c_void_p(buf0.data_ptr()), c_long(s0), c_long(s2))
    del arg10_1
    buf1 = torch.ops.onednn.qconv2d_pointwise(buf0, 0.024776775389909744, 97, constant5, constant2, constant3, constant0, [1, 1], [1, 1], [1, 1], 1, 95.88209060714476, 0, False, 'relu', [], '')
    assert_size_stride(buf1, (s0, 16, 1 + s2, 1 + s2), (16 + (16*(s2*s2)) + (32*s2), 1, 16 + (16*s2), 16))
    del buf0
    # Source Nodes: [quantize_per_tensor_default_2], Original ATen: [quantized_decomposed.quantize_per_tensor]
    buf2 = torch.ops.quantized.max_pool2d(buf1, [3, 3], [2, 2], [1, 1], [1, 1], False)
    del buf1
    buf3 = buf2
    assert_size_stride(buf3, (s0, 16, 1 + (s2 // 2), 1 + (s2 // 2)), (16 + (16*((s2 // 2)*(s2 // 2))) + (32*(s2 // 2)), 1, 16 + (16*(s2 // 2)), 16))
    del buf2
    buf4 = empty_strided((s0, 16, 1, 1), (16, 1, 16*s0, 16*s0), device='cpu', dtype=torch.float32)
    buf5 = empty_strided((s0, 16), (16, 1), device='cpu', dtype=torch.uint8)
    cpp_fused_dequantize_per_tensor_mean_quantize_per_tensor_1(c_void_p(buf3.data_ptr()), c_void_p(buf4.data_ptr()), c_void_p(buf5.data_ptr()), c_long(s0), c_long(s2))
    del buf3
    buf6 = torch.ops.onednn.qlinear_pointwise(buf5, 0.005949148442596197, 0, constant6, constant4, constant3, constant1, 176.31645543014483, 100, False, 'none', [], '')
    assert_size_stride(buf6, (s0, 16), (16, 1))
    del buf5
    buf7 = reinterpret_tensor(buf4, (s0, 16), (16, 1)); del buf4  # reuse
    cpp_fused_dequantize_per_tensor_2(c_void_p(buf6.data_ptr()), c_void_p(buf7.data_ptr()), c_long(s0))
    return (buf7, )

```

**TestPlan**
```
python -m pytest test_mkldnn_pattern_matcher.py -k test_qconv2d_maxpool2d_linear_dynamic
```

cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng Xia-Weiwen wenzhe-nrv jiayisunx peterbell10 ipiszy ngimel yf225 chenyang78 kadeng muchulee8 aakhundov

[ghstack-poisoned]
…upport"


**Summary**
This diff enables dynamic batch size support for the quantization use case in Inductor. Taking the UT in this PR as an example, after this PR the generated code will assume a dynamic input batch size.
```
cpp_fused_quantize_per_tensor_0 = async_compile.cpp('''
#include "/tmp/torchinductor_root/ib/cibrnuq56cxamjj4krp4zpjvsirbmlolpbnmomodzyd46huzhdw7.h"
extern "C" void kernel(const float* in_ptr0,
                       unsigned char* out_ptr0,
                       const long ks0,
                       const long ks1)
{
    {
        #pragma GCC ivdep
        for(long i0=static_cast<long>(0L); i0<static_cast<long>(ks0); i0+=static_cast<long>(1L))
        {
            #pragma GCC ivdep
            for(long i1=static_cast<long>(0L); i1<static_cast<long>(3L); i1+=static_cast<long>(1L))
            {
                #pragma GCC ivdep
                for(long i2=static_cast<long>(0L); i2<static_cast<long>(static_cast<long>(ks1*ks1)); i2+=static_cast<long>(1L))
                {
                    auto tmp0 = in_ptr0[static_cast<long>(i2 + (i1*(static_cast<long>(ks1*ks1))) + (3L*i0*(static_cast<long>(ks1*ks1))))];
                    auto tmp1 = static_cast<float>(40.36037717834931);
                    auto tmp2 = decltype(tmp0)(tmp0 * tmp1);
                    auto tmp3 = std::nearbyint(tmp2);
                    auto tmp4 = static_cast<float>(97.0);
                    auto tmp5 = tmp3 + tmp4;
                    auto tmp6 = static_cast<float>(0.0);
                    auto tmp7 = max_propagate_nan(tmp5, tmp6);
                    auto tmp8 = static_cast<float>(255.0);
                    auto tmp9 = min_propagate_nan(tmp7, tmp8);
                    auto tmp10 = static_cast<unsigned char>(tmp9);
                    out_ptr0[static_cast<long>(i1 + (3L*i2) + (3L*i0*(static_cast<long>(ks1*ks1))))] = tmp10;
                }
            }
        }
    }
}
''')


cpp_fused_dequantize_per_tensor_mean_quantize_per_tensor_1 = async_compile.cpp('''
#include "/tmp/torchinductor_root/ib/cibrnuq56cxamjj4krp4zpjvsirbmlolpbnmomodzyd46huzhdw7.h"
extern "C" void kernel(const unsigned char* in_ptr0,
                       float* out_ptr0,
                       unsigned char* out_ptr1,
                       const long ks0,
                       const long ks1)
{
    {
        #pragma GCC ivdep
        for(long i0=static_cast<long>(0L); i0<static_cast<long>(ks0); i0+=static_cast<long>(1L))
        {
            for(long i1=static_cast<long>(0L); i1<static_cast<long>(16L); i1+=static_cast<long>(16L))
            {
                {
                    #pragma omp declare reduction(+:at::vec::Vectorized<float>:omp_out = omp_out + omp_in) initializer(omp_priv={at::vec::Vectorized<float>(0)})
                    float tmp_acc0 = 0;
                    at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0);
                    for(long i2=static_cast<long>(0L); i2<static_cast<long>(1L + (static_cast<long>((at::native::div_floor_integer(ks1, 2L))*(at::native::div_floor_integer(ks1, 2L)))) + (2L*(at::native::div_floor_integer(ks1, 2L)))); i2+=static_cast<long>(1L))
                    {
                        auto tmp0 = at::vec::Vectorized<uint8_t>::loadu_one_fourth(in_ptr0 + static_cast<long>(i1 + (16L*i0) + (16L*i2) + (16L*i0*(static_cast<long>((at::native::div_floor_integer(ks1, 2L))*(at::native::div_floor_integer(ks1, 2L))))) + (32L*i0*(at::native::div_floor_integer(ks1, 2L)))));
                        auto tmp1 = at::vec::convert_uint8_to_float(tmp0);
                        auto tmp2 = at::vec::Vectorized<float>(static_cast<float>(0.0));
                        auto tmp3 = tmp1 - tmp2;
                        auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(0.010429476387798786));
                        auto tmp5 = tmp3 * tmp4;
                        tmp_acc0_vec = tmp_acc0_vec + tmp5;
                    }
                    tmp_acc0_vec.store(out_ptr0 + static_cast<long>(i1 + (16L*i0)));
                }
            }
        }
    }
    {
        #pragma GCC ivdep
        for(long i0=static_cast<long>(0L); i0<static_cast<long>(16L*ks0); i0+=static_cast<long>(1L))
        {
            auto tmp0 = out_ptr0[static_cast<long>(i0)];
            auto tmp1 = static_cast<float>(1L + (static_cast<long>((at::native::div_floor_integer(ks1, 2L))*(at::native::div_floor_integer(ks1, 2L)))) + (2L*(at::native::div_floor_integer(ks1, 2L))));
            auto tmp2 = tmp0 / tmp1;
            auto tmp3 = static_cast<float>(168.09128392896545);
            auto tmp4 = decltype(tmp2)(tmp2 * tmp3);
            auto tmp5 = std::nearbyint(tmp4);
            auto tmp6 = static_cast<float>(0.0);
            auto tmp7 = tmp5 + tmp6;
            auto tmp8 = max_propagate_nan(tmp7, tmp6);
            auto tmp9 = static_cast<float>(255.0);
            auto tmp10 = min_propagate_nan(tmp8, tmp9);
            auto tmp11 = static_cast<unsigned char>(tmp10);
            out_ptr1[static_cast<long>(i0)] = tmp11;
        }
    }
}
''')


cpp_fused_dequantize_per_tensor_2 = async_compile.cpp('''
#include "/tmp/torchinductor_root/ib/cibrnuq56cxamjj4krp4zpjvsirbmlolpbnmomodzyd46huzhdw7.h"
extern "C" void kernel(const unsigned char* in_ptr0,
                       float* out_ptr0,
                       const long ks0)
{
    {
        for(long i0=static_cast<long>(0L); i0<static_cast<long>(16L*ks0); i0+=static_cast<long>(16L))
        {
            auto tmp0 = at::vec::Vectorized<uint8_t>::loadu_one_fourth(in_ptr0 + static_cast<long>(i0));
            auto tmp1 = at::vec::convert_uint8_to_float(tmp0);
            auto tmp2 = at::vec::Vectorized<float>(static_cast<float>(100.0));
            auto tmp3 = tmp1 - tmp2;
            auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(0.0056716203689575195));
            auto tmp5 = tmp3 * tmp4;
            tmp5.store(out_ptr0 + static_cast<long>(i0));
        }
    }
}
''')


async_compile.wait(globals())
del async_compile

def call(args):
    arg8_1, arg9_1, arg10_1 = args
    args.clear()
    s0 = arg8_1
    s2 = arg9_1
    assert_size_stride(arg10_1, (s0, 3, s2, s2), (3*(s2*s2), s2*s2, s2, 1))
    buf0 = empty_strided((s0, 3, s2, s2), (3*(s2*s2), 1, 3*s2, 3), device='cpu', dtype=torch.uint8)
    cpp_fused_quantize_per_tensor_0(c_void_p(arg10_1.data_ptr()), c_void_p(buf0.data_ptr()), c_long(s0), c_long(s2))
    del arg10_1
    buf1 = torch.ops.onednn.qconv2d_pointwise(buf0, 0.024776775389909744, 97, constant5, constant2, constant3, constant0, [1, 1], [1, 1], [1, 1], 1, 95.88209060714476, 0, False, 'relu', [], '')
    assert_size_stride(buf1, (s0, 16, 1 + s2, 1 + s2), (16 + (16*(s2*s2)) + (32*s2), 1, 16 + (16*s2), 16))
    del buf0
    # Source Nodes: [quantize_per_tensor_default_2], Original ATen: [quantized_decomposed.quantize_per_tensor]
    buf2 = torch.ops.quantized.max_pool2d(buf1, [3, 3], [2, 2], [1, 1], [1, 1], False)
    del buf1
    buf3 = buf2
    assert_size_stride(buf3, (s0, 16, 1 + (s2 // 2), 1 + (s2 // 2)), (16 + (16*((s2 // 2)*(s2 // 2))) + (32*(s2 // 2)), 1, 16 + (16*(s2 // 2)), 16))
    del buf2
    buf4 = empty_strided((s0, 16, 1, 1), (16, 1, 16*s0, 16*s0), device='cpu', dtype=torch.float32)
    buf5 = empty_strided((s0, 16), (16, 1), device='cpu', dtype=torch.uint8)
    cpp_fused_dequantize_per_tensor_mean_quantize_per_tensor_1(c_void_p(buf3.data_ptr()), c_void_p(buf4.data_ptr()), c_void_p(buf5.data_ptr()), c_long(s0), c_long(s2))
    del buf3
    buf6 = torch.ops.onednn.qlinear_pointwise(buf5, 0.005949148442596197, 0, constant6, constant4, constant3, constant1, 176.31645543014483, 100, False, 'none', [], '')
    assert_size_stride(buf6, (s0, 16), (16, 1))
    del buf5
    buf7 = reinterpret_tensor(buf4, (s0, 16), (16, 1)); del buf4  # reuse
    cpp_fused_dequantize_per_tensor_2(c_void_p(buf6.data_ptr()), c_void_p(buf7.data_ptr()), c_long(s0))
    return (buf7, )

```

**TestPlan**
```
python -m pytest test_mkldnn_pattern_matcher.py -k test_qconv2d_maxpool2d_linear_dynamic
```

cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng Xia-Weiwen wenzhe-nrv jiayisunx peterbell10 ipiszy ngimel yf225 chenyang78 kadeng muchulee8 aakhundov

[ghstack-poisoned]
@leslie-fang-intel leslie-fang-intel marked this pull request as ready for review September 7, 2023 07:49
@leslie-fang-intel
Copy link
Collaborator Author

Hi @eellison, could you kindly take a look at this PR?

leslie-fang-intel added a commit that referenced this pull request Sep 8, 2023
ghstack-source-id: 81a5e6cc69b22bd22e90e2e75205ed29cbfe8a59
Pull Request resolved: #108550
…upport"


**Summary**
This diff enables dynamic batch size support for the quantization use case in Inductor. Taking the UT in this PR as an example, after this PR the generated code will assume a dynamic input batch size.
```
cpp_fused_quantize_per_tensor_0 = async_compile.cpp('''
#include "/tmp/torchinductor_root/ib/cibrnuq56cxamjj4krp4zpjvsirbmlolpbnmomodzyd46huzhdw7.h"
extern "C" void kernel(const float* in_ptr0,
                       unsigned char* out_ptr0,
                       const long ks0,
                       const long ks1)
{
    {
        #pragma GCC ivdep
        for(long i0=static_cast<long>(0L); i0<static_cast<long>(ks0); i0+=static_cast<long>(1L))
        {
            #pragma GCC ivdep
            for(long i1=static_cast<long>(0L); i1<static_cast<long>(3L); i1+=static_cast<long>(1L))
            {
                #pragma GCC ivdep
                for(long i2=static_cast<long>(0L); i2<static_cast<long>(static_cast<long>(ks1*ks1)); i2+=static_cast<long>(1L))
                {
                    auto tmp0 = in_ptr0[static_cast<long>(i2 + (i1*(static_cast<long>(ks1*ks1))) + (3L*i0*(static_cast<long>(ks1*ks1))))];
                    auto tmp1 = static_cast<float>(40.36037717834931);
                    auto tmp2 = decltype(tmp0)(tmp0 * tmp1);
                    auto tmp3 = std::nearbyint(tmp2);
                    auto tmp4 = static_cast<float>(97.0);
                    auto tmp5 = tmp3 + tmp4;
                    auto tmp6 = static_cast<float>(0.0);
                    auto tmp7 = max_propagate_nan(tmp5, tmp6);
                    auto tmp8 = static_cast<float>(255.0);
                    auto tmp9 = min_propagate_nan(tmp7, tmp8);
                    auto tmp10 = static_cast<unsigned char>(tmp9);
                    out_ptr0[static_cast<long>(i1 + (3L*i2) + (3L*i0*(static_cast<long>(ks1*ks1))))] = tmp10;
                }
            }
        }
    }
}
''')


cpp_fused_dequantize_per_tensor_mean_quantize_per_tensor_1 = async_compile.cpp('''
#include "/tmp/torchinductor_root/ib/cibrnuq56cxamjj4krp4zpjvsirbmlolpbnmomodzyd46huzhdw7.h"
extern "C" void kernel(const unsigned char* in_ptr0,
                       float* out_ptr0,
                       unsigned char* out_ptr1,
                       const long ks0,
                       const long ks1)
{
    {
        #pragma GCC ivdep
        for(long i0=static_cast<long>(0L); i0<static_cast<long>(ks0); i0+=static_cast<long>(1L))
        {
            for(long i1=static_cast<long>(0L); i1<static_cast<long>(16L); i1+=static_cast<long>(16L))
            {
                {
                    #pragma omp declare reduction(+:at::vec::Vectorized<float>:omp_out = omp_out + omp_in) initializer(omp_priv={at::vec::Vectorized<float>(0)})
                    float tmp_acc0 = 0;
                    at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0);
                    for(long i2=static_cast<long>(0L); i2<static_cast<long>(1L + (static_cast<long>((at::native::div_floor_integer(ks1, 2L))*(at::native::div_floor_integer(ks1, 2L)))) + (2L*(at::native::div_floor_integer(ks1, 2L)))); i2+=static_cast<long>(1L))
                    {
                        auto tmp0 = at::vec::Vectorized<uint8_t>::loadu_one_fourth(in_ptr0 + static_cast<long>(i1 + (16L*i0) + (16L*i2) + (16L*i0*(static_cast<long>((at::native::div_floor_integer(ks1, 2L))*(at::native::div_floor_integer(ks1, 2L))))) + (32L*i0*(at::native::div_floor_integer(ks1, 2L)))));
                        auto tmp1 = at::vec::convert_uint8_to_float(tmp0);
                        auto tmp2 = at::vec::Vectorized<float>(static_cast<float>(0.0));
                        auto tmp3 = tmp1 - tmp2;
                        auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(0.010429476387798786));
                        auto tmp5 = tmp3 * tmp4;
                        tmp_acc0_vec = tmp_acc0_vec + tmp5;
                    }
                    tmp_acc0_vec.store(out_ptr0 + static_cast<long>(i1 + (16L*i0)));
                }
            }
        }
    }
    {
        #pragma GCC ivdep
        for(long i0=static_cast<long>(0L); i0<static_cast<long>(16L*ks0); i0+=static_cast<long>(1L))
        {
            auto tmp0 = out_ptr0[static_cast<long>(i0)];
            auto tmp1 = static_cast<float>(1L + (static_cast<long>((at::native::div_floor_integer(ks1, 2L))*(at::native::div_floor_integer(ks1, 2L)))) + (2L*(at::native::div_floor_integer(ks1, 2L))));
            auto tmp2 = tmp0 / tmp1;
            auto tmp3 = static_cast<float>(168.09128392896545);
            auto tmp4 = decltype(tmp2)(tmp2 * tmp3);
            auto tmp5 = std::nearbyint(tmp4);
            auto tmp6 = static_cast<float>(0.0);
            auto tmp7 = tmp5 + tmp6;
            auto tmp8 = max_propagate_nan(tmp7, tmp6);
            auto tmp9 = static_cast<float>(255.0);
            auto tmp10 = min_propagate_nan(tmp8, tmp9);
            auto tmp11 = static_cast<unsigned char>(tmp10);
            out_ptr1[static_cast<long>(i0)] = tmp11;
        }
    }
}
''')


cpp_fused_dequantize_per_tensor_2 = async_compile.cpp('''
#include "/tmp/torchinductor_root/ib/cibrnuq56cxamjj4krp4zpjvsirbmlolpbnmomodzyd46huzhdw7.h"
extern "C" void kernel(const unsigned char* in_ptr0,
                       float* out_ptr0,
                       const long ks0)
{
    {
        for(long i0=static_cast<long>(0L); i0<static_cast<long>(16L*ks0); i0+=static_cast<long>(16L))
        {
            auto tmp0 = at::vec::Vectorized<uint8_t>::loadu_one_fourth(in_ptr0 + static_cast<long>(i0));
            auto tmp1 = at::vec::convert_uint8_to_float(tmp0);
            auto tmp2 = at::vec::Vectorized<float>(static_cast<float>(100.0));
            auto tmp3 = tmp1 - tmp2;
            auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(0.0056716203689575195));
            auto tmp5 = tmp3 * tmp4;
            tmp5.store(out_ptr0 + static_cast<long>(i0));
        }
    }
}
''')


async_compile.wait(globals())
del async_compile

def call(args):
    arg8_1, arg9_1, arg10_1 = args
    args.clear()
    s0 = arg8_1
    s2 = arg9_1
    assert_size_stride(arg10_1, (s0, 3, s2, s2), (3*(s2*s2), s2*s2, s2, 1))
    buf0 = empty_strided((s0, 3, s2, s2), (3*(s2*s2), 1, 3*s2, 3), device='cpu', dtype=torch.uint8)
    cpp_fused_quantize_per_tensor_0(c_void_p(arg10_1.data_ptr()), c_void_p(buf0.data_ptr()), c_long(s0), c_long(s2))
    del arg10_1
    buf1 = torch.ops.onednn.qconv2d_pointwise(buf0, 0.024776775389909744, 97, constant5, constant2, constant3, constant0, [1, 1], [1, 1], [1, 1], 1, 95.88209060714476, 0, False, 'relu', [], '')
    assert_size_stride(buf1, (s0, 16, 1 + s2, 1 + s2), (16 + (16*(s2*s2)) + (32*s2), 1, 16 + (16*s2), 16))
    del buf0
    # Source Nodes: [quantize_per_tensor_default_2], Original ATen: [quantized_decomposed.quantize_per_tensor]
    buf2 = torch.ops.quantized.max_pool2d(buf1, [3, 3], [2, 2], [1, 1], [1, 1], False)
    del buf1
    buf3 = buf2
    assert_size_stride(buf3, (s0, 16, 1 + (s2 // 2), 1 + (s2 // 2)), (16 + (16*((s2 // 2)*(s2 // 2))) + (32*(s2 // 2)), 1, 16 + (16*(s2 // 2)), 16))
    del buf2
    buf4 = empty_strided((s0, 16, 1, 1), (16, 1, 16*s0, 16*s0), device='cpu', dtype=torch.float32)
    buf5 = empty_strided((s0, 16), (16, 1), device='cpu', dtype=torch.uint8)
    cpp_fused_dequantize_per_tensor_mean_quantize_per_tensor_1(c_void_p(buf3.data_ptr()), c_void_p(buf4.data_ptr()), c_void_p(buf5.data_ptr()), c_long(s0), c_long(s2))
    del buf3
    buf6 = torch.ops.onednn.qlinear_pointwise(buf5, 0.005949148442596197, 0, constant6, constant4, constant3, constant1, 176.31645543014483, 100, False, 'none', [], '')
    assert_size_stride(buf6, (s0, 16), (16, 1))
    del buf5
    buf7 = reinterpret_tensor(buf4, (s0, 16), (16, 1)); del buf4  # reuse
    cpp_fused_dequantize_per_tensor_2(c_void_p(buf6.data_ptr()), c_void_p(buf7.data_ptr()), c_long(s0))
    return (buf7, )

```

**TestPlan**
```
python -m pytest test_mkldnn_pattern_matcher.py -k test_qconv2d_maxpool2d_linear_dynamic
```

cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng Xia-Weiwen wenzhe-nrv jiayisunx peterbell10 ipiszy ngimel yf225 chenyang78 kadeng muchulee8 aakhundov

[ghstack-poisoned]
@leslie-fang-intel
Copy link
Collaborator Author

Hi @eellison, could you kindly take a look at this PR?

leslie-fang-intel added a commit that referenced this pull request Sep 14, 2023
ghstack-source-id: d43f4ae2477b3ec25a62a11727d2f1a5db6aa7df
Pull Request resolved: #108550
…upport"


**Summary**
This diff enables dynamic batch size support for the quantization use case in Inductor. Taking the UT in this PR as an example, after this PR the generated code will assume a dynamic input batch size.
```
cpp_fused_quantize_per_tensor_0 = async_compile.cpp('''
#include "/tmp/torchinductor_root/ib/cibrnuq56cxamjj4krp4zpjvsirbmlolpbnmomodzyd46huzhdw7.h"
// Generated Inductor CPU kernel: quantize_per_tensor on the model input.
// Reads a contiguous float tensor laid out as (ks0, 3, ks1, ks1) and writes a
// uint8 tensor with the channel dimension innermost (channels-last), as shown
// by the output index i1 + 3*i2 + 3*i0*ks1*ks1.
// ks0 is the symbolic (dynamic) batch size; ks1 is the square spatial extent.
extern "C" void kernel(const float* in_ptr0,
                       unsigned char* out_ptr0,
                       const long ks0,
                       const long ks1)
{
    {
        #pragma GCC ivdep
        // i0: batch index, bounded by the runtime batch size ks0.
        for(long i0=static_cast<long>(0L); i0<static_cast<long>(ks0); i0+=static_cast<long>(1L))
        {
            #pragma GCC ivdep
            // i1: input channel index (fixed at 3 channels).
            for(long i1=static_cast<long>(0L); i1<static_cast<long>(3L); i1+=static_cast<long>(1L))
            {
                #pragma GCC ivdep
                // i2: flattened spatial index over ks1*ks1 positions.
                for(long i2=static_cast<long>(0L); i2<static_cast<long>(static_cast<long>(ks1*ks1)); i2+=static_cast<long>(1L))
                {
                    auto tmp0 = in_ptr0[static_cast<long>(i2 + (i1*(static_cast<long>(ks1*ks1))) + (3L*i0*(static_cast<long>(ks1*ks1))))];
                    // 40.36037717834931 is the reciprocal of the quantization
                    // scale (1/40.36... ~= 0.024776775, the scale passed to the
                    // qconv2d_pointwise call in the wrapper), so tmp2 = x / scale.
                    auto tmp1 = static_cast<float>(40.36037717834931);
                    auto tmp2 = decltype(tmp0)(tmp0 * tmp1);
                    // Round-to-nearest, then shift by the zero point (97).
                    auto tmp3 = std::nearbyint(tmp2);
                    auto tmp4 = static_cast<float>(97.0);
                    auto tmp5 = tmp3 + tmp4;
                    // Clamp into the uint8 range [0, 255]; the *_propagate_nan
                    // helpers keep NaN-propagation semantics for min/max.
                    auto tmp6 = static_cast<float>(0.0);
                    auto tmp7 = max_propagate_nan(tmp5, tmp6);
                    auto tmp8 = static_cast<float>(255.0);
                    auto tmp9 = min_propagate_nan(tmp7, tmp8);
                    auto tmp10 = static_cast<unsigned char>(tmp9);
                    // Store in channels-last order: channel (i1) is innermost.
                    out_ptr0[static_cast<long>(i1 + (3L*i2) + (3L*i0*(static_cast<long>(ks1*ks1))))] = tmp10;
                }
            }
        }
    }
}
''')


cpp_fused_dequantize_per_tensor_mean_quantize_per_tensor_1 = async_compile.cpp('''
#include "/tmp/torchinductor_root/ib/cibrnuq56cxamjj4krp4zpjvsirbmlolpbnmomodzyd46huzhdw7.h"
// Generated Inductor CPU kernel: fused dequantize -> spatial mean -> requantize.
// in_ptr0 is the uint8 max-pool output with 16 channels innermost; out_ptr0
// receives the per-(batch, channel) float sum (the division by the element
// count happens in the second loop), and out_ptr1 receives the requantized
// uint8 mean. ks0 is the dynamic batch size; ks1 is the pre-pool spatial
// extent, so the pooled plane has
// 1 + (ks1/2)^2 + 2*(ks1/2) == (1 + ks1/2)^2 positions.
extern "C" void kernel(const unsigned char* in_ptr0,
                       float* out_ptr0,
                       unsigned char* out_ptr1,
                       const long ks0,
                       const long ks1)
{
    {
        #pragma GCC ivdep
        // i0: batch index (dynamic).
        for(long i0=static_cast<long>(0L); i0<static_cast<long>(ks0); i0+=static_cast<long>(1L))
        {
            // i1: channel tile — all 16 channels fit in one vector lane group,
            // so this loop executes exactly once.
            for(long i1=static_cast<long>(0L); i1<static_cast<long>(16L); i1+=static_cast<long>(16L))
            {
                {
                    #pragma omp declare reduction(+:at::vec::Vectorized<float>:omp_out = omp_out + omp_in) initializer(omp_priv={at::vec::Vectorized<float>(0)})
                    float tmp_acc0 = 0;
                    at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0);
                    // i2: pooled spatial position; the bound expands (1 + ks1/2)^2
                    // using floor division on the symbolic ks1.
                    for(long i2=static_cast<long>(0L); i2<static_cast<long>(1L + (static_cast<long>((at::native::div_floor_integer(ks1, 2L))*(at::native::div_floor_integer(ks1, 2L)))) + (2L*(at::native::div_floor_integer(ks1, 2L)))); i2+=static_cast<long>(1L))
                    {
                        // Load 16 uint8 channel values and dequantize:
                        // (q - zero_point) * scale with zero_point 0 and
                        // scale 0.010429476387798786.
                        auto tmp0 = at::vec::Vectorized<uint8_t>::loadu_one_fourth(in_ptr0 + static_cast<long>(i1 + (16L*i0) + (16L*i2) + (16L*i0*(static_cast<long>((at::native::div_floor_integer(ks1, 2L))*(at::native::div_floor_integer(ks1, 2L))))) + (32L*i0*(at::native::div_floor_integer(ks1, 2L)))));
                        auto tmp1 = at::vec::convert_uint8_to_float(tmp0);
                        auto tmp2 = at::vec::Vectorized<float>(static_cast<float>(0.0));
                        auto tmp3 = tmp1 - tmp2;
                        auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(0.010429476387798786));
                        auto tmp5 = tmp3 * tmp4;
                        // Accumulate the spatial sum per channel.
                        tmp_acc0_vec = tmp_acc0_vec + tmp5;
                    }
                    // Store the per-(batch, channel) sum; mean is taken below.
                    tmp_acc0_vec.store(out_ptr0 + static_cast<long>(i1 + (16L*i0)));
                }
            }
        }
    }
    {
        #pragma GCC ivdep
        // Second pass over the flattened (ks0, 16) sums: divide by the pooled
        // element count to form the mean, then requantize to uint8.
        for(long i0=static_cast<long>(0L); i0<static_cast<long>(16L*ks0); i0+=static_cast<long>(1L))
        {
            auto tmp0 = out_ptr0[static_cast<long>(i0)];
            // Element count of the pooled plane, again (1 + ks1/2)^2.
            auto tmp1 = static_cast<float>(1L + (static_cast<long>((at::native::div_floor_integer(ks1, 2L))*(at::native::div_floor_integer(ks1, 2L)))) + (2L*(at::native::div_floor_integer(ks1, 2L))));
            auto tmp2 = tmp0 / tmp1;
            // 168.09128392896545 is the reciprocal of the requantization scale;
            // the zero point here is 0 (tmp6 added below).
            auto tmp3 = static_cast<float>(168.09128392896545);
            auto tmp4 = decltype(tmp2)(tmp2 * tmp3);
            auto tmp5 = std::nearbyint(tmp4);
            auto tmp6 = static_cast<float>(0.0);
            auto tmp7 = tmp5 + tmp6;
            // Clamp into the uint8 range [0, 255] with NaN propagation.
            auto tmp8 = max_propagate_nan(tmp7, tmp6);
            auto tmp9 = static_cast<float>(255.0);
            auto tmp10 = min_propagate_nan(tmp8, tmp9);
            auto tmp11 = static_cast<unsigned char>(tmp10);
            out_ptr1[static_cast<long>(i0)] = tmp11;
        }
    }
}
''')


cpp_fused_dequantize_per_tensor_2 = async_compile.cpp('''
#include "/tmp/torchinductor_root/ib/cibrnuq56cxamjj4krp4zpjvsirbmlolpbnmomodzyd46huzhdw7.h"
// Generated Inductor CPU kernel: final dequantize_per_tensor of the qlinear
// output back to float. Processes the flat (ks0, 16) uint8 tensor 16 lanes at
// a time; ks0 is the dynamic batch size, so the trip count is 16*ks0.
extern "C" void kernel(const unsigned char* in_ptr0,
                       float* out_ptr0,
                       const long ks0)
{
    {
        for(long i0=static_cast<long>(0L); i0<static_cast<long>(16L*ks0); i0+=static_cast<long>(16L))
        {
            // Dequantize: (q - zero_point) * scale, with zero_point 100 and
            // scale 0.0056716203689575195, vectorized over 16 elements.
            auto tmp0 = at::vec::Vectorized<uint8_t>::loadu_one_fourth(in_ptr0 + static_cast<long>(i0));
            auto tmp1 = at::vec::convert_uint8_to_float(tmp0);
            auto tmp2 = at::vec::Vectorized<float>(static_cast<float>(100.0));
            auto tmp3 = tmp1 - tmp2;
            auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(0.0056716203689575195));
            auto tmp5 = tmp3 * tmp4;
            tmp5.store(out_ptr0 + static_cast<long>(i0));
        }
    }
}
''')


async_compile.wait(globals())
del async_compile

def call(args):
    # Generated Inductor wrapper for the quantized conv -> maxpool -> mean ->
    # linear model. The first two args are the symbolic sizes: s0 is the
    # dynamic batch size and s2 the dynamic spatial extent; arg10_1 is the
    # float input tensor of shape (s0, 3, s2, s2).
    arg8_1, arg9_1, arg10_1 = args
    args.clear()
    s0 = arg8_1
    s2 = arg9_1
    assert_size_stride(arg10_1, (s0, 3, s2, s2), (3*(s2*s2), s2*s2, s2, 1))
    # Quantize the input to uint8 in channels-last layout (stride 3 on the
    # spatial dims, 1 on the channel dim).
    buf0 = empty_strided((s0, 3, s2, s2), (3*(s2*s2), 1, 3*s2, 3), device='cpu', dtype=torch.uint8)
    cpp_fused_quantize_per_tensor_0(c_void_p(arg10_1.data_ptr()), c_void_p(buf0.data_ptr()), c_long(s0), c_long(s2))
    del arg10_1
    # Fused quantized conv2d + ReLU via oneDNN; output stays uint8 and its
    # symbolic shape/strides are checked against the (1 + s2) spatial size.
    buf1 = torch.ops.onednn.qconv2d_pointwise(buf0, 0.024776775389909744, 97, constant5, constant2, constant3, constant0, [1, 1], [1, 1], [1, 1], 1, 95.88209060714476, 0, False, 'relu', [], '')
    assert_size_stride(buf1, (s0, 16, 1 + s2, 1 + s2), (16 + (16*(s2*s2)) + (32*s2), 1, 16 + (16*s2), 16))
    del buf0
    # Source Nodes: [quantize_per_tensor_default_2], Original ATen: [quantized_decomposed.quantize_per_tensor]
    # Quantized 3x3/stride-2 max pool; spatial size becomes 1 + s2 // 2.
    buf2 = torch.ops.quantized.max_pool2d(buf1, [3, 3], [2, 2], [1, 1], [1, 1], False)
    del buf1
    buf3 = buf2
    assert_size_stride(buf3, (s0, 16, 1 + (s2 // 2), 1 + (s2 // 2)), (16 + (16*((s2 // 2)*(s2 // 2))) + (32*(s2 // 2)), 1, 16 + (16*(s2 // 2)), 16))
    del buf2
    # buf4 holds the float spatial-mean scratch; buf5 holds its requantized
    # uint8 form, both sized by the dynamic batch s0.
    buf4 = empty_strided((s0, 16, 1, 1), (16, 1, 16*s0, 16*s0), device='cpu', dtype=torch.float32)
    buf5 = empty_strided((s0, 16), (16, 1), device='cpu', dtype=torch.uint8)
    cpp_fused_dequantize_per_tensor_mean_quantize_per_tensor_1(c_void_p(buf3.data_ptr()), c_void_p(buf4.data_ptr()), c_void_p(buf5.data_ptr()), c_long(s0), c_long(s2))
    del buf3
    # Fused quantized linear via oneDNN on the pooled (s0, 16) activations.
    buf6 = torch.ops.onednn.qlinear_pointwise(buf5, 0.005949148442596197, 0, constant6, constant4, constant3, constant1, 176.31645543014483, 100, False, 'none', [], '')
    assert_size_stride(buf6, (s0, 16), (16, 1))
    del buf5
    # Reuse buf4's storage for the final dequantized float output.
    buf7 = reinterpret_tensor(buf4, (s0, 16), (16, 1)); del buf4  # reuse
    cpp_fused_dequantize_per_tensor_2(c_void_p(buf6.data_ptr()), c_void_p(buf7.data_ptr()), c_long(s0))
    return (buf7, )

```

**TestPlan**
```
python -m pytest test_mkldnn_pattern_matcher.py -k test_qconv2d_maxpool2d_linear_dynamic
```

cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng Xia-Weiwen wenzhe-nrv jiayisunx peterbell10 ipiszy ngimel yf225 chenyang78 kadeng muchulee8 aakhundov

[ghstack-poisoned]
@leslie-fang-intel
Copy link
Collaborator Author

@pytorchbot merge

@pytorchmergebot
Copy link
Collaborator

Merge failed

Reason: This PR needs a release notes: label
If your changes are user facing and intended to be a part of release notes, please use a label starting with release notes:.

If not, please add the topic: not user facing label.

To add a label, you can comment to pytorchbot, for example
@pytorchbot label "topic: not user facing"

For more information, see
https://github.com/pytorch/pytorch/wiki/PyTorch-AutoLabel-Bot#why-categorize-for-release-notes-and-how-does-it-work.

Details for Dev Infra team Raised by workflow job

@leslie-fang-intel
Copy link
Collaborator Author

@pytorchbot merge

@pytorchmergebot
Copy link
Collaborator

Merge started

Your change will be merged once all checks pass (ETA 0-4 Hours).

Learn more about merging in the wiki.

Questions? Feedback? Please reach out to the PyTorch DevX Team

Advanced Debugging
Check the merge workflow status
here

@facebook-github-bot facebook-github-bot deleted the gh/leslie-fang-intel/78/head branch September 22, 2023 14:25
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

None yet

5 participants