Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions test/inductor/test_kernel_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -344,6 +344,25 @@ def f(a, b):
# = 0.022
self.check_bandwidth(compiled_module, "0.022")

def test_star_dep(self):
    """
    Check the bandwidth estimate of a kernel whose accesses include a StarDep.

    An in-place indexed assignment is compiled and executed once, then the
    bandwidth figure reported for the resulting compiled module is compared
    against the expected value.
    """

    @torch.compile
    def f(a, b):
        a[b] = 3.0

    target = torch.rand(10000, 5000, device=GPU_TYPE)
    row_ids = torch.randint(
        0, 10000, [20000], device=GPU_TYPE, dtype=torch.int32
    )
    # Add a trailing dimension so the index tensor has shape (20000, 1).
    index = row_ids.unsqueeze(1)

    # Run once so that a compiled module exists to be inspected.
    f(target, index)

    # Expected traffic:
    #   index:  20000 * 4 bytes = 80KB
    #   target: 10000 * 5000 * 4 bytes = 200MB
    # NOTE(review): the 200MB figure assumes the default float32 dtype for
    # `target` and that the whole buffer is counted once -- confirm against
    # the estimator.
    self.check_bandwidth(self.get_compiled_module(), "0.200")


if __name__ == "__main__":
if HAS_GPU:
Expand Down
11 changes: 8 additions & 3 deletions torch/_inductor/codegen/triton.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
from ..._dynamo.utils import counters
from .. import config, ir, scheduler
from ..codecache import code_hash, get_path, PyCodeCache
from ..dependencies import Dep, MemoryDep, StarDep
from ..dependencies import Dep, MemoryDep, StarDep, WeakDep
from ..ir import IRNode, ReductionHint, TritonTemplateBuffer
from ..optimize_indexing import indexing_dtype_strength_reduction
from ..scheduler import BaseSchedulerNode, BaseScheduling, WhyNoFuse
Expand Down Expand Up @@ -2620,9 +2620,14 @@ def estimate_kernel_num_bytes(self):
# This arg points to a buf that has been sliced.
# We need to count each individual slice to have
# a better estimation.
indices = set()
indices: Set[Any] = set()
no_index_dep_count = 0
for dep in self.buf_accesses[arg]:
indices.add(dep.index)
if isinstance(dep, (StarDep, WeakDep)):
indices.add(f"no_index_dep_{no_index_dep_count}")
no_index_dep_count += 1
else:
indices.add(dep.index)
numel = len(indices) * out_numel
else:
numel = buf_size
Expand Down