
inductor: remove redundant memory copy when view a ExternKernelAlloc buffer #108635

Closed

Commits on Sep 6, 2023

  1. 4b8b3a1

Commits on Sep 8, 2023

  1. Update on "inductor: remove redundant memory copy when view a ExternKernelAlloc buffer"
    
    
    
    When viewing an ExternKernelAlloc buffer, there is always a redundant memory copy:
    ```
    buf0: ExternKernelSchedulerNode(MKLPackedLinear)
    buf0.writes = [StarDep(name='buf0')]
    buf0.unmet_dependencies = []
    buf0.met_dependencies = [StarDep(name='arg1_1'), StarDep(name='constant0'), StarDep(name='constant1')]
    buf0.users = [NodeUser(node=SchedulerNode(name='buf1'), can_inplace=True, is_weak=False)]
    buf0.node.kernel = torch.ops.mkl._mkl_linear
    
    
    buf1: SchedulerNode(ComputedBuffer)
    buf1.writes = [MemoryDep('buf1', c0, {c0: 64})]
    buf1.unmet_dependencies = [MemoryDep('buf0', c0, {c0: 64})]
    buf1.met_dependencies = []
    buf1.users = [NodeUser(node=OUTPUT, can_inplace=False, is_weak=False)]
    buf1.group.device = cpu
    buf1.group.iteration = ((64,), ())
    buf1.sizes = ([64], [])
    class buf1_loop_body:
        var_ranges = {z0: 64}
        index0 = z0
        def body(self, ops):
            get_index = self.get_index('index0')
            load = ops.load('buf0', get_index)
            get_index_1 = self.get_index('index0')
            store = ops.store('buf1', get_index_1, load, None)
            return store
    ```
    
    and the cpp backend-generated code is:
    ```
    cpp_fused_view_0 = async_compile.cpp('''
    #include "/tmp/torchinductor_xiaobing/ib/cibrnuq56cxamjj4krp4zpjvsirbmlolpbnmomodzyd46huzhdw7.h"
    extern "C" void kernel(float* in_out_ptr0)
    {
        #pragma omp parallel num_threads(40)
        {
            {
                #pragma omp for
                for(long i0=static_cast<long>(0L); i0<static_cast<long>(64L); i0+=static_cast<long>(16L))
                {
                    auto tmp0 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + static_cast<long>(i0));
                    tmp0.store(in_out_ptr0 + static_cast<long>(i0));
                }
            }
        }
    }
    ''')
    
    
    async_compile.wait(globals())
    del async_compile
    
    def call(args):
        arg1_1, = args
        args.clear()
        assert_size_stride(arg1_1, (4, 16), (16, 1))
        buf0 = torch.ops.mkl._mkl_linear(arg1_1, constant1, constant0, None, 4)
        del arg1_1
        buf1 = reinterpret_tensor(buf0, (4, 4, 4), (16, 4, 1)); del buf0  # reuse
        cpp_fused_view_0(c_void_p(buf1.data_ptr()))
        return (buf1, )
    ```
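
    To make the example above concrete, a small script along these lines should reproduce the pattern; the setup here (CPU inference with inductor freezing enabled so the Linear is lowered to torch.ops.mkl._mkl_linear) is an assumption for illustration, not taken from this PR:
    ```
    # Hypothetical repro sketch: a Linear whose output is viewed to a new shape.
    # With freezing, the Linear is expected to become an ExternKernelAlloc
    # (MKLPackedLinear) buffer, and the .view() is what emitted the copy kernel.
    import torch
    import torch._inductor.config as inductor_config

    inductor_config.freezing = True  # prepack the Linear weight for MKL

    class M(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.linear = torch.nn.Linear(16, 16, bias=False)

        def forward(self, x):
            # buf0 = extern mkl linear output, buf1 = its view
            return self.linear(x).view(4, 4, 4)

    with torch.no_grad():
        compiled = torch.compile(M().eval())
        out = compiled(torch.randn(4, 16))
    ```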
    
    For an ExternKernelAlloc buffer, we can do a real view rather than a memory copy.
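
    With that change, the generated wrapper would be expected to collapse to something like the sketch below (illustrative only, not output copied from this PR): the cpp_fused_view_0 kernel disappears and the view is expressed purely through reinterpret_tensor:
    ```
    def call(args):
        arg1_1, = args
        args.clear()
        assert_size_stride(arg1_1, (4, 16), (16, 1))
        buf0 = torch.ops.mkl._mkl_linear(arg1_1, constant1, constant0, None, 4)
        del arg1_1
        # the view is now just a shape/stride reinterpretation of buf0;
        # no copy kernel is compiled or launched
        return (reinterpret_tensor(buf0, (4, 4, 4), (16, 4, 1)), )
    ```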
    
    cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen zhuhaozhe blzheng Xia-Weiwen wenzhe-nrv jiayisunx peterbell10 ipiszy ngimel yf225 chenyang78 kadeng muchulee8 aakhundov
    
    [ghstack-poisoned]
    XiaobingSuper committed Sep 8, 2023
    6e328ed
  2. Update on "inductor: remove redundant memory copy when view a ExternKernelAlloc buffer"
    
    
    
    [ghstack-poisoned]
    XiaobingSuper committed Sep 8, 2023
    f98eff9