[inductor] Small fixes for huggingface models #728

Merged: 3 commits merged on Aug 8, 2022

Changes from 2 commits
11 changes: 10 additions & 1 deletion tests/test_torchinductor.py
@@ -1426,7 +1426,7 @@ def fn(mask, value):
),
)

def test_pow(self):
def test_pow1(self):
def fn(x):
return [aten.pow(x, e) for e in range(-8, 9)]

@@ -1435,6 +1435,15 @@ def fn(x):
(torch.randn([16, 16]),),
)

def test_pow2(self):
def fn(x):
return aten.pow(1000, x), aten.pow(x, 1000)

self.common(
fn,
(torch.randn([16, 16]),),
)

def test_glu(self):
def fn(x):
return aten.glu(x, -1), aten.glu(x, 1), aten.glu(x, 2)
4 changes: 4 additions & 0 deletions torchinductor/codegen/cpp.py
@@ -140,6 +140,10 @@ def exp(x):
def sqrt(x):
return f"std::sqrt({x})"

@staticmethod
def pow(a, b):
return f"std::pow({a}, {b})"

@staticmethod
def log(x):
return f"std::log({x})"
9 changes: 8 additions & 1 deletion torchinductor/codegen/triton.py
@@ -155,6 +155,10 @@ def logical_or(a, b):
def rand(seed, offset, _): # _ here to keep the contract identical to CPU rand op
return f"tl.rand({seed}, {offset})"

@staticmethod
def pow(a, b):
return f"tl.libdevice.pow({a}, {b})"

@staticmethod
def log(x):
if has_triton_libdevice():
@@ -444,14 +448,17 @@ def initialize_range_tree(self, pid_cache):
def disable_reduction(self):
@contextlib.contextmanager
def ctx():
if not self.inside_reduction:
if self.numels[-1] == 1:
assert not self.inside_reduction
yield
return
# calling codegen_body() will flush all the pending buffers
# and write out a reduction loop
self.codegen_body()
self.inside_reduction = False
yield
# flush out any code before opening the next loop
self.codegen_body()
self.inside_reduction = True

return ctx()
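The disable_reduction() change above replaces the `if not self.inside_reduction` check with `if self.numels[-1] == 1`, so a kernel whose reduction numel is 1 takes the no-op path, and the added `codegen_body()` call flushes pending code before the next loop opens. A minimal, self-contained sketch of the same flush / disable / restore pattern, using an invented toy class rather than the real TritonKernel state:

```python
import contextlib


class ToyKernel:
    """Invented stand-in for TritonKernel, only to show the pattern."""

    def __init__(self, numels):
        self.numels = numels
        self.inside_reduction = numels[-1] != 1
        self.pending = []          # buffered lines of generated code

    def codegen_body(self):
        # The real method writes buffered code out (wrapping it in a reduction
        # loop when inside_reduction is set); here we just flush the buffer.
        flushed, self.pending = self.pending, []
        return flushed

    def disable_reduction(self):
        @contextlib.contextmanager
        def ctx():
            if self.numels[-1] == 1:
                # reduction numel of 1: there is no reduction to disable
                assert not self.inside_reduction
                yield
                return
            self.codegen_body()    # flush before leaving the reduction loop
            self.inside_reduction = False
            yield
            self.codegen_body()    # flush before opening the next loop
            self.inside_reduction = True

        return ctx()


k = ToyKernel(numels=[64, 1])
with k.disable_reduction():
    assert not k.inside_reduction
```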
4 changes: 3 additions & 1 deletion torchinductor/decomposition.py
@@ -202,10 +202,12 @@ def rsub(a, b):
return b - a


@register_decomposition([aten.masked_fill.Scalar])
@register_decomposition([aten.masked_fill])
def masked_fill(value, mask, other):
if isinstance(other, numbers.Number):
other = torch.tensor(other, dtype=value.dtype, device=value.device)
if other.device != value.device and other.numel() == 1:
other = other.to(value.device)
value, mask, other = torch.broadcast_tensors(value, mask, other)
return torch.where(mask, other, value)

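A hedged, standalone check of the decomposition above, written against the public torch API rather than torchinductor's registration machinery: masked_fill rewritten as broadcast plus where should agree with the eager op, including for Python-number fill values.

```python
import numbers

import torch


def masked_fill_decomp(value, mask, other):
    if isinstance(other, numbers.Number):
        other = torch.tensor(other, dtype=value.dtype, device=value.device)
    if other.device != value.device and other.numel() == 1:
        # a one-element fill tensor created on another device is moved to the
        # input's device instead of forcing a cross-device broadcast
        other = other.to(value.device)
    value, mask, other = torch.broadcast_tensors(value, mask, other)
    return torch.where(mask, other, value)


x = torch.randn(4, 4)
mask = torch.rand(4, 4) > 0.5
assert torch.equal(masked_fill_decomp(x, mask, -1.0), x.masked_fill(mask, -1.0))
```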
47 changes: 35 additions & 12 deletions torchinductor/lowering.py
@@ -271,12 +271,26 @@ def _to_copy(


@register_lowering(aten.to)
def to(x, device_or_dtype, non_blocking=False, copy=False, memory_format=None):
def to(
x,
device_or_dtype=None,
non_blocking=False,
copy=False,
memory_format=None,
device=None,
dtype=None,
layout=None,
):
assert not memory_format, "TODO"
assert layout in (None, torch.strided)
if isinstance(device_or_dtype, torch.dtype):
return to_dtype(x, device_or_dtype)
if isinstance(device_or_dtype, torch.device):
return to_device(x, device_or_dtype)
if device is not None:
return to_device(x, device)
Review comment (Contributor): this will be wrong if both device and dtype are specified?

Reply (Contributor Author): Good catch, will update.

if dtype is not None:
return to_dtype(x, dtype)
assert False, device_or_dtype
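As the review comment above points out, returning as soon as `device` is handled drops a simultaneously passed `dtype`. A hedged sketch of the follow-up fix (not the code that was actually merged), using small stand-ins for torchinductor's to_device / to_dtype lowerings:

```python
import torch


def to_sketch(x, device_or_dtype=None, device=None, dtype=None):
    # stand-ins for the to_device / to_dtype lowerings used above
    to_device = lambda t, d: t.to(device=d)
    to_dtype = lambda t, dt: t.to(dtype=dt)

    if isinstance(device_or_dtype, torch.dtype):
        return to_dtype(x, device_or_dtype)
    if isinstance(device_or_dtype, torch.device):
        return to_device(x, device_or_dtype)
    if device is not None:
        x = to_device(x, device)   # do not return yet: dtype may also be set
    if dtype is not None:
        x = to_dtype(x, dtype)
    if device is not None or dtype is not None:
        return x
    raise AssertionError(device_or_dtype)


y = to_sketch(torch.randn(2, 2), device=torch.device("cpu"), dtype=torch.float16)
assert y.dtype == torch.float16 and y.device.type == "cpu"
```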


@@ -503,7 +517,6 @@ def roll(a, shifts, dims=tuple()):
raise RuntimeError(
f"shifts and dimensions must align. shifts: {len_shifts}, dims: {len_dims}"
)
assert len_dims > 1
tail_shifts = shifts[1:]
tail_dims = dims[1:]
first_dim_rolled = roll(a, shifts[0], dims[0])
@@ -531,10 +544,13 @@ def fn(index):

@register_lowering(aten.as_strided)
def as_strided(x, size, stride, storage_offset=None):
if isinstance(x, TensorBox) and isinstance(x.data, ir.BaseView):
# as_strided ignores views
x = x.data.unwrap_view()
x.realize()
if not ir.is_storage_and_layout(x):
if not ir.is_contiguous_storage_and_layout(x):
raise NotImplementedError(f"unrealized as_strided({x}, ...)")
storage, old_layout = ir.as_storage_and_layout(x)
storage, old_layout = ir.as_contiguous_storage_and_layout(x)
new_layout = ir.FixedLayout(
old_layout.device,
old_layout.dtype,
@@ -851,6 +867,13 @@ def arange(
end = start
start = 0

if isinstance(start, float) and int(start) == start:
start = int(start)
if isinstance(end, float) and int(end) == end:
end = int(end)
if isinstance(step, float) and int(step) == step:
step = int(step)

assert isinstance(start, int)
assert isinstance(end, int)
assert isinstance(step, int)
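The three coercions added above exist because the asserts below only accept integer start/end/step. A hedged sketch of the same narrowing as a standalone helper (that huggingface models pass integral floats such as 10.0 here is an assumption based on the PR title):

```python
def as_int_if_integral(v):
    # Narrow floats holding an exact integer value (e.g. 10.0 -> 10) so an
    # integer-only arange lowering can accept them; leave everything else alone.
    if isinstance(v, float) and int(v) == v:
        return int(v)
    return v


assert as_int_if_integral(10.0) == 10 and type(as_int_if_integral(10.0)) is int
assert as_int_if_integral(2.5) == 2.5
assert as_int_if_integral(7) == 7
```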
@@ -2367,16 +2390,19 @@ def pow_recursive(x, y, dtype):
return result


@make_pointwise
def pow_native(a, b):
return ops.pow(a, b)


@register_lowering(aten.pow, broadcast=True)
def pow(a, b):
# see https://github.com/openai/triton/issues/506
# triton doesn't support pow, so need to rewrite it
# this is a lowering not a decomp, due to upstream pytorch being unstable
if isinstance(b, float) and b == int(b):
return pow(a, int(b))
elif isinstance(b, int) and b == 1:
return a
elif isinstance(b, int):
elif isinstance(b, int) and -32 < b < 32:
# Optimize away small fixed powers
loader = a.make_loader()

def fn(idx):
@@ -2389,10 +2415,7 @@ def fn(idx):
ranges=a.get_size(),
)
else:
assert False, "TODO: check correctness here"
# odd integer: torch.sign(a) * torch.exp(torch.log(torch.abs(a)) * b)
# even integer: torch.exp(torch.log(torch.abs(a)) * b)
# else: torch.exp(torch.log(a) * b)
return pow_native(a, b)


def mutate_to(changed, val):
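Because Triton lacked a pow op (openai/triton#506), the lowering above expands small fixed integer exponents via pow_recursive and sends everything else to pow_native / ops.pow. pow_recursive's body isn't shown in this diff, but the standard way to turn an exponent in (-32, 32) into multiplications is exponentiation by squaring; a hedged sketch of that idea on plain Python floats:

```python
def pow_by_squaring(x: float, n: int) -> float:
    # Expand x**n into O(log |n|) multiplications; negative exponents go
    # through the reciprocal. No pow intrinsic is needed anywhere.
    if n < 0:
        return 1.0 / pow_by_squaring(x, -n)
    result = 1.0
    while n:
        if n & 1:
            result *= x
        x *= x
        n >>= 1
    return result


assert abs(pow_by_squaring(3.0, 5) - 243.0) < 1e-9
assert abs(pow_by_squaring(2.0, -3) - 0.125) < 1e-12
```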
11 changes: 10 additions & 1 deletion torchinductor/sizevars.py
@@ -1,6 +1,7 @@
import collections
import dataclasses
import functools
import logging
from typing import Dict
from typing import List

@@ -11,6 +12,8 @@

from .virtualized import V

log = logging.getLogger(__name__)
Review comment (Contributor): Used logging and removed for this PR?

Reply (Contributor Author): Yeah, I think it is fine to leave this line in for the future, though.



@dataclasses.dataclass
class ZeroGuard:
@@ -278,7 +281,13 @@ def stride_hints(self, index: sympy.Expr, vars: List[sympy.Symbol]):
for v in index.free_symbols:
if str(v).startswith("indirect"):
index = index.subs({v: 0})
return [self.size_hint(s) for s in self.stride_vars(index, vars)]
result = []
for s in self.stride_vars(index, vars):
try:
result.append(self.size_hint(s))
except TypeError:
result.append(0)
return result

def stride_order(self, index: sympy.Expr, vars: List[sympy.Symbol]):
strides = tuple(map(abs, self.stride_hints(index, vars)))
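The stride_hints change above swallows a TypeError from size_hint and substitutes 0. A hedged illustration of the failure mode being guarded against (the exact expression inside size_hint is an assumption; this only shows that forcing an unresolved sympy symbol to an int raises TypeError):

```python
import sympy


def size_hint_or_zero(expr):
    try:
        return int(expr)           # works for concrete sympy Integers
    except TypeError:
        return 0                   # symbolic value with no concrete hint


assert size_hint_or_zero(sympy.Integer(12)) == 12
assert size_hint_or_zero(sympy.Symbol("s0")) == 0
```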