
inductor: remove redundant memory copy when view a ExternKernelAlloc buffer #108635

Closed

Conversation

@XiaobingSuper (Collaborator) commented Sep 6, 2023

Stack from ghstack (oldest at bottom):

When viewing an ExternKernelAlloc buffer, there is always a redundant memory copy:

```
buf0: ExternKernelSchedulerNode(MKLPackedLinear)
buf0.writes = [StarDep(name='buf0')]
buf0.unmet_dependencies = []
buf0.met_dependencies = [StarDep(name='arg1_1'), StarDep(name='constant0'), StarDep(name='constant1')]
buf0.users = [NodeUser(node=SchedulerNode(name='buf1'), can_inplace=True, is_weak=False)]
buf0.node.kernel = torch.ops.mkl._mkl_linear


buf1: SchedulerNode(ComputedBuffer)
buf1.writes = [MemoryDep('buf1', c0, {c0: 64})]
buf1.unmet_dependencies = [MemoryDep('buf0', c0, {c0: 64})]
buf1.met_dependencies = []
buf1.users = [NodeUser(node=OUTPUT, can_inplace=False, is_weak=False)]
buf1.group.device = cpu
buf1.group.iteration = ((64,), ())
buf1.sizes = ([64], [])
class buf1_loop_body:
    var_ranges = {z0: 64}
    index0 = z0
    def body(self, ops):
        get_index = self.get_index('index0')
        load = ops.load('buf0', get_index)
        get_index_1 = self.get_index('index0')
        store = ops.store('buf1', get_index_1, load, None)
        return store
```
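
In plain terms, `buf1_loop_body` above is just an element-wise copy of the viewed buffer. A rough, illustrative Python equivalent (not generated code, only a sketch of what the IR does):

```
import torch

# Illustrative equivalent of buf1_loop_body: copy all 64 values of buf0 into
# buf1, i.e. the "view" is lowered to a plain memory copy.
buf0 = torch.randn(64)   # stands in for the _mkl_linear output
buf1 = torch.empty(64)   # stands in for the reinterpreted output buffer
for z0 in range(64):
    buf1[z0] = buf0[z0]
```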

and the cpp backend-generated code is:

```
cpp_fused_view_0 = async_compile.cpp('''
#include "/tmp/torchinductor_xiaobing/ib/cibrnuq56cxamjj4krp4zpjvsirbmlolpbnmomodzyd46huzhdw7.h"
extern "C" void kernel(float* in_out_ptr0)
{
    #pragma omp parallel num_threads(40)
    {
        {
            #pragma omp for
            for(long i0=static_cast<long>(0L); i0<static_cast<long>(64L); i0+=static_cast<long>(16L))
            {
                auto tmp0 = at::vec::Vectorized<float>::loadu(in_out_ptr0 + static_cast<long>(i0));
                tmp0.store(in_out_ptr0 + static_cast<long>(i0));
            }
        }
    }
}
''')


async_compile.wait(globals())
del async_compile

def call(args):
    arg1_1, = args
    args.clear()
    assert_size_stride(arg1_1, (4, 16), (16, 1))
    buf0 = torch.ops.mkl._mkl_linear(arg1_1, constant1, constant0, None, 4)
    del arg1_1
    buf1 = reinterpret_tensor(buf0, (4, 4, 4), (16, 4, 1)); del buf0  # reuse
    cpp_fused_view_0(c_void_p(buf1.data_ptr()))
    return (buf1, )
```
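
For reference, a pattern like the following should hit this path (a minimal, hypothetical sketch, not taken from the PR; it assumes an MKL-enabled CPU build and the inductor `freezing` config flag so the eval-mode linear is packed into `torch.ops.mkl._mkl_linear`):

```
import torch
import torch._inductor.config as inductor_config

# Hypothetical repro sketch: with freezing enabled on an MKL CPU build, the
# eval-mode linear is packed into torch.ops.mkl._mkl_linear (an ExternKernelAlloc
# buffer) and the .view() on its output becomes the buf1 copy shown above.
inductor_config.freezing = True

class M(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(16, 16)

    def forward(self, x):
        return self.linear(x).view(4, 4, 4)

mod = M().eval()
x = torch.randn(4, 16)
with torch.no_grad():
    out = torch.compile(mod)(x)
```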

For an ExternKernelAlloc buffer, we can do a real view rather than a memory copy.
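
With this change the view becomes a pure metadata operation on the extern kernel's output, so the copy kernel disappears. A sketch of the wrapper code one would expect after the fix (same example as above, not copied from the PR's actual output):

```
def call(args):
    arg1_1, = args
    args.clear()
    assert_size_stride(arg1_1, (4, 16), (16, 1))
    buf0 = torch.ops.mkl._mkl_linear(arg1_1, constant1, constant0, None, 4)
    del arg1_1
    # A real view: reinterpret buf0's storage with the new size/stride instead
    # of emitting a cpp_fused_view_0 copy kernel.
    return (reinterpret_tensor(buf0, (4, 4, 4), (16, 4, 1)), )
```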

cc @voznesenskym @penguinwu @EikanWang @jgong5 @Guobing-Chen @zhuhaozhe @blzheng @Xia-Weiwen @wenzhe-nrv @jiayisunx @peterbell10 @ipiszy @ngimel @yf225 @chenyang78 @kadeng @muchulee8 @aakhundov

@pytorch-bot (bot) commented Sep 6, 2023

🔗 Helpful Links

🧪 See artifacts and rendered test results at hud.pytorch.org/pr/108635

Note: Links to docs will display an error until the docs builds have been completed.

❗ 1 Active SEV

There is 1 currently active SEV. If your PR is affected, please view it below:

✅ You can merge normally! (1 Unrelated Failure)

As of commit f98eff9 with merge base ff38c0e:

FLAKY - The following job failed but was likely due to flakiness present on trunk:

This comment was automatically generated by Dr. CI and updates every 15 minutes.

XiaobingSuper added a commit that referenced this pull request Sep 6, 2023
…buffer

ghstack-source-id: ba3e02cd417ae706ccec93d9fe178f6caa42a2c7
Pull Request resolved: #108635
XiaobingSuper added a commit that referenced this pull request Sep 8, 2023
…buffer

ghstack-source-id: e47660d74d1dcc98a87acffc55d4a4f898abafeb
Pull Request resolved: #108635
@XiaobingSuper XiaobingSuper added the ciflow/trunk Trigger trunk jobs on your pull request label Sep 8, 2023
XiaobingSuper added a commit that referenced this pull request Sep 8, 2023
…buffer

ghstack-source-id: 5566e5ca067c702adad9d86a77a101b43269ab9c
Pull Request resolved: #108635
@XiaobingSuper (Collaborator, Author)

@pytorchbot merge

@pytorchmergebot (Collaborator)

Merge started

Your change will be merged once all checks pass (ETA 0-4 Hours).

Learn more about merging in the wiki.

Questions? Feedback? Please reach out to the PyTorch DevX Team

Advanced Debugging: check the merge workflow status here.

@pytorchmergebot (Collaborator)

Merge failed

Reason: 1 job has failed: inductor-periodic / cuda12.1-py3.10-gcc9-sm86-periodic-dynamo-benchmarks / test (dynamo_eager_torchbench, 1, 1, linux.g5.4xlarge.nvidia.gpu)

Details for Dev Infra team (raised by workflow job)

@XiaobingSuper (Collaborator, Author)

@pytorchbot merge -f "flaky issue, not related to this pr"

@pytorchmergebot (Collaborator)

Merge started

Your change will be merged immediately since you used the force (-f) flag, bypassing any CI checks (ETA: 1-5 minutes). Please use -f as last resort and instead consider -i/--ignore-current to continue the merge ignoring current failures. This will allow currently pending tests to finish and report signal before the merge.

Learn more about merging in the wiki.

Questions? Feedback? Please reach out to the PyTorch DevX Team

Advanced Debugging: check the merge workflow status here.

@facebook-github-bot facebook-github-bot deleted the gh/XiaobingSuper/163/head branch September 14, 2023 14:22
michiboo pushed a commit to michiboo/pytorch that referenced this pull request Sep 17, 2023
…buffer (pytorch#108635)

Pull Request resolved: pytorch#108635
Approved by: https://github.com/jgong5, https://github.com/desertfire, https://github.com/jansel
ghstack dependencies: pytorch#108560
