pytorch · Chillee · Oct 24, 2024 · Oct 24, 2024 · vadimkantorov · Oct 24, 2024
diff --git a/aten/src/ATen/functorch/BatchRulesScatterOps.cpp b/aten/src/ATen/functorch/BatchRulesScatterOps.cpp
@@ -235,7 +235,7 @@ std::tuple<Tensor, std::optional<int64_t>> index_batch_rule(
   bool advanced_indices_are_adjacent = are_advanced_indices_adjacent(indices);
 
   // Step 1
-  const auto batched_indices = batchIndices(indices, indices_bdims, self_.size(0), self_bdim);
+  const auto batched_indices = batchIndices(indices, indices_bdims, self_.sym_size(0), self_bdim);
   auto num_leading_nones = get_num_leading_nones(indices);
   auto max_index_dim = get_max_index_logical_dim(indices, indices_bdims);
 
@@ -841,26 +841,26 @@ std::tuple<Tensor, std::optional<int64_t>> gather_batch_rule(
   return std::make_tuple(result, 0);
 }
 
-Tensor get_expanded_index(const Tensor& index, IntArrayRef self_size, int64_t dim) {
+Tensor get_expanded_index(const Tensor& index, SymIntArrayRef self_size, int64_t dim) {
   if (index.dim() == 0) {
-    return index.expand(self_size);
+    return index.expand_symint(self_size);  // 0-dim index: expand directly to self_size
   }
   dim = maybe_wrap_dim(dim, static_cast<int64_t>(self_size.size()));
 
   // setup new_index_shape as [BS, 1, ..., idx_size, ..., 1]
   // to reshape index_
-  auto idx_size = index.size(0);  // get non-batch size of index tensor
+  auto idx_size = index.sym_size(0);  // symbolic size of index along its dim 0 (the non-batch size)
   Tensor index_;
   {
-    VmapDimVector new_index_shape(self_size.size(), 1);
+    VmapSymDimVector new_index_shape(self_size.size(), 1);  // one entry per self dim, each set to 1
     new_index_shape[dim] = idx_size;
-    index_ = index.view(new_index_shape);
+    index_ = index.view_symint(new_index_shape);  // index entries now run along `dim` only
   }
   // Now apply expand to index_
   {
-    VmapDimVector new_index_shape = {self_size.begin(), self_size.end()};
+    VmapSymDimVector new_index_shape = {self_size.begin(), self_size.end()};  // copy of self_size
     new_index_shape[dim] = idx_size;
-    index_ = index_.expand(new_index_shape);
+    index_ = index_.expand_symint(new_index_shape);  // self_size everywhere except idx_size at `dim`
   }
   return index_;
 }
@@ -869,7 +869,7 @@ Tensor index_select_decomp(const Tensor &self, int64_t dim, const Tensor &index)
 {
   Tensor index_ = index;
   if (self.dim() > index.dim()) {
-    index_ = get_expanded_index(index, self.sizes(), dim);
+    index_ = get_expanded_index(index, self.sym_sizes(), dim);
   }
 
   auto result = at::gather(self, dim, index_);
@@ -893,7 +893,7 @@ Tensor index_copy_decomp(
 {
   Tensor index_ = index;
   if (self.dim() > index.dim()) {
-    index_ = get_expanded_index(index, self.sizes(), dim);
+    index_ = get_expanded_index(index, self.sym_sizes(), dim);
   }
 
   return at::scatter(self, dim, index_, source);  ;
@@ -909,7 +909,7 @@ Tensor slice_scatter_decomp(const Tensor &self, const Tensor &src,
                             std::optional<int64_t> end, int64_t step)
 {
   auto idx = at::arange(start.value_or(0), end.value_or(self.size(dim)), step, self.options().dtype(kLong));
-  idx = get_expanded_index(idx, self.sizes(), dim);
+  idx = get_expanded_index(idx, self.sym_sizes(), dim);
   return at::scatter(self, dim, idx, src);
 }
 

diff --git a/test/inductor/test_flex_attention.py b/test/inductor/test_flex_attention.py
@@ -2567,6 +2567,51 @@ def mask_mod(b, h, q, kv):
         ):
             torch.compile(flex_attention)(query, key, value, block_mask=block_mask)
 
+    @supported_platform
+    def test_free_symbol_dynamic(self):
+        def batch_flip_causal(b, h, q_idx, kv_idx):
+            return (q_idx >= kv_idx) & (b % 2 == 0)
+
+        class SimpleAttention(torch.nn.Module):
+            def __init__(self, dim=512, n_head=8):
+                super().__init__()
+                self.qkv = torch.nn.Linear(dim, 3 * dim)
+                self.n_head = n_head
+                self.head_dim = dim // n_head
+
+            def forward(self, x, block_mask=None):
+                B, T, C = x.size()
+                qkv = self.qkv(x).view(B, T, 3, self.n_head, self.head_dim)
+                qkv = qkv.permute(2, 0, 3, 1, 4)
+                q, k, v = qkv
+                y = flex_attention(q, k, v, block_mask=block_mask)
+                return y.transpose(1, 2).contiguous().view(B, T, C)
+
+        model = SimpleAttention().cuda()
+        model.compile(mode="default", dynamic=True)
+        sequence_len = 256
+
+        # Test different batch shapes with dense masks
+        torch._dynamo.reset()
+        for batch_shape in [4, 16, 32]:
+            # Create dense mask
+            rand_mask = torch.randint(0, 2, (batch_shape, sequence_len)).cuda().bool()
+            block_mask = torch.compile(create_block_mask, dynamic=True)(
+                B=batch_shape,
+                BLOCK_SIZE=128,
+                mask_mod=lambda b, h, q_idx, kv_idx: ~rand_mask[b, q_idx],
+                H=None,
+                Q_LEN=sequence_len,
+                KV_LEN=sequence_len,
+                device="cuda",
+            )
+
+            # Run forward pass
+            x = torch.randn(batch_shape, sequence_len, 512).cuda()
+            y = model(x, block_mask=block_mask)
+
+        self.assertEqual(torch._dynamo.utils.counters["aot_autograd"]["ok"], 2)
+
     @supported_platform
     def test_fw_bw_graph_correctness(self):
         cnt = CompileCounterWithBackend("aot_eager")

diff --git a/torch/_inductor/select_algorithm.py b/torch/_inductor/select_algorithm.py
@@ -376,8 +376,14 @@ def modification(
             subgraph = self.subgraphs[subgraph_number]
 
             def add_input(name):
+                # Side effect: registers name as a kernel input; returns the variable used in tl.load
                 return self.args.input(name)
 
+            def print_and_rename_indexing(index):
+                # This also implicitly adds the indexing symbols as an input to
+                # the kernel
+                return self.kexpr(self.rename_indexing(index))
+
             name = f"PlaceholderSubstitution_{subgraph_number}"
 
             class PlaceholderSubstitution(V.WrapperHandler):  # type: ignore[name-defined]
@@ -387,7 +393,7 @@ def load(self, name: str, index: sympy.Expr):
                     if name not in fixed_inputs:
                         # If it's not a fixed input, it's a load from a captured
                         # tensor
-                        index_str = outer_self.kexpr(index)
+                        index_str = print_and_rename_indexing(index)
                         var = add_input(name)
                         return f"tl.load({var} + {index_str})"