From 403902217e93b26ab6ed978bd79467aab9e470ba Mon Sep 17 00:00:00 2001
From: Astarag Mohapatra <40949756+Athe-kunal@users.noreply.github.com>
Date: Wed, 29 Oct 2025 00:48:15 -0700
Subject: [PATCH 1/5] Enable gradient tracking for 'y' in
 mul_relu_block_back_spec

I was getting the error below; it is resolved by enabling gradient tracking for `y`, which is also passed to `torch.autograd.grad`.

```
RuntimeError: One of the differentiated Tensors does not require grad
```
---
 docs/helion_puzzles.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/helion_puzzles.rst b/docs/helion_puzzles.rst
index 0a25d5ee0..af2d28ee9 100644
--- a/docs/helion_puzzles.rst
+++ b/docs/helion_puzzles.rst
@@ -269,6 +269,7 @@ While PyTorch and torch.compile automatically generates the backwards pass for y
         x = x.clone()
         y = y.clone()
         x = x.requires_grad_(True)
+        y = y.requires_grad_(True)
         z = torch.relu(x * y[:, None])
         grad_x, grad_y = torch.autograd.grad(z, [x, y], dz, retain_graph=True)
         return grad_x

From 59a621f9b6f8aec94620fba3732aa72751d25338 Mon Sep 17 00:00:00 2001
From: Astarag Mohapatra <40949756+Athe-kunal@users.noreply.github.com>
Date: Thu, 30 Oct 2025 23:51:56 -0700
Subject: [PATCH 2/5] Update accumulator initialization in helion_puzzles.rst

Change the accumulator initialization to use the device of the input tensor, and use `torch.zeros` instead of `torch.zeros_like` because the latter expects a tensor argument.
---
 docs/helion_puzzles.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/helion_puzzles.rst b/docs/helion_puzzles.rst
index af2d28ee9..bd463a7ab 100644
--- a/docs/helion_puzzles.rst
+++ b/docs/helion_puzzles.rst
@@ -326,7 +326,7 @@ Sum of a batch of numbers.
         # Use Helion to tile the batch dimension
         for tile_batch in hl.tile(batch):
             # Initialize accumulator for each batch element
-            acc = torch.zeros_like(tile_batch, dtype=torch.float32)
+            acc = torch.zeros(tile_batch, dtype=torch.float32, device=x.device)
 
             # Process the sequence in chunks
             for tile_seq in hl.tile(seq_len):

From 50d4e51598b84cff667f2c9788d109ebd00c8c40 Mon Sep 17 00:00:00 2001
From: Astarag Mohapatra <40949756+Athe-kunal@users.noreply.github.com>
Date: Mon, 3 Nov 2025 09:04:56 -0800
Subject: [PATCH 3/5] Replace torch.max with torch.amax

`torch.max` is not supported in Helion kernels; use `torch.amax`, which returns the maximum values directly.
---
 docs/helion_puzzles.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/helion_puzzles.rst b/docs/helion_puzzles.rst
index bd463a7ab..0867872d3 100644
--- a/docs/helion_puzzles.rst
+++ b/docs/helion_puzzles.rst
@@ -433,7 +433,7 @@ A scalar version of FlashAttention.
                 scores = q_tile[:, None] * k_tile[None, :]
 
                 # Find max for numerical stability
-                batch_max = torch.max(scores, dim=1)[0]
+                batch_max = torch.amax(scores, dim=1)
                 new_max = torch.maximum(max_val, batch_max)
 
                 # Scale old accumulations

From cd0fb69fc392796ad22b4c20e6eb3313b1311dfc Mon Sep 17 00:00:00 2001
From: Astarag Mohapatra <40949756+Athe-kunal@users.noreply.github.com>
Date: Fri, 7 Nov 2025 22:18:25 -0800
Subject: [PATCH 4/5] Modify tensor initialization and padding for device
 compatibility

Updated tensor operations to ensure compatibility with input device.
---
 docs/helion_puzzles.rst | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/helion_puzzles.rst b/docs/helion_puzzles.rst
index 0867872d3..a34c0de08 100644
--- a/docs/helion_puzzles.rst
+++ b/docs/helion_puzzles.rst
@@ -468,8 +468,8 @@ A batched 2D convolution.
 .. code-block:: python
 
     def conv2d_spec(x: Float32[Tensor, "4 8 8"], k: Float32[Tensor, "4 4"]) -> Float32[Tensor, "4 8 8"]:
-        z = torch.zeros(4, 8, 8)
-        x = torch.nn.functional.pad(x, (0, 4, 0, 4, 0, 0), value=0.0)
+        z = torch.zeros(4, 8, 8).to(x.device)
+        x = torch.nn.functional.pad(x, (0, 4, 0, 4, 0, 0), value=0.0).to(x.device)
         for i in range(8):
             for j in range(8):
                 z[:, i, j] = (k[None, :, :] * x[:, i: i+4, j: j + 4]).sum(1).sum(1)
@@ -495,7 +495,7 @@ A batched 2D convolution.
                     # Extract the patch
                     patch = x_padded[tile_batch, i:i+kh, j:j+kw]
                     # Apply the kernel
-                    out[tile_batch, i, j] = (k[tile_batch] * patch).sum([1, 2])
+                    out[tile_batch, i, j] = (k[tile_batch,:,:] * patch).sum([1, 2])
 
         return out
 

From 2decb8ca8b4eb8a998bb849d1d966cdf9ccd7168 Mon Sep 17 00:00:00 2001
From: Astarag Mohapatra <40949756+Athe-kunal@users.noreply.github.com>
Date: Sat, 8 Nov 2025 15:07:51 -0800
Subject: [PATCH 5/5] Refactor tensor initialization in conv2d_spec

---
 docs/helion_puzzles.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/helion_puzzles.rst b/docs/helion_puzzles.rst
index a34c0de08..3045df48c 100644
--- a/docs/helion_puzzles.rst
+++ b/docs/helion_puzzles.rst
@@ -468,8 +468,8 @@ A batched 2D convolution.
 .. code-block:: python
 
     def conv2d_spec(x: Float32[Tensor, "4 8 8"], k: Float32[Tensor, "4 4"]) -> Float32[Tensor, "4 8 8"]:
-        z = torch.zeros(4, 8, 8).to(x.device)
-        x = torch.nn.functional.pad(x, (0, 4, 0, 4, 0, 0), value=0.0).to(x.device)
+        z = torch.zeros(4, 8, 8, device=x.device)
+        x = torch.nn.functional.pad(x, (0, 4, 0, 4, 0, 0), value=0.0)
         for i in range(8):
             for j in range(8):
                 z[:, i, j] = (k[None, :, :] * x[:, i: i+4, j: j + 4]).sum(1).sum(1)