From 403902217e93b26ab6ed978bd79467aab9e470ba Mon Sep 17 00:00:00 2001 From: Astarag Mohapatra <40949756+Athe-kunal@users.noreply.github.com> Date: Wed, 29 Oct 2025 00:48:15 -0700 Subject: [PATCH 1/5] Enable gradient tracking for 'y' in mul_relu_block_back_spec I was getting an error. Resolved after enabling gradient tracking for y ``` RuntimeError: One of the differentiated Tensors does not require grad ``` --- docs/helion_puzzles.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/helion_puzzles.rst b/docs/helion_puzzles.rst index 0a25d5ee0..af2d28ee9 100644 --- a/docs/helion_puzzles.rst +++ b/docs/helion_puzzles.rst @@ -269,6 +269,7 @@ While PyTorch and torch.compile automatically generates the backwards pass for y x = x.clone() y = y.clone() x = x.requires_grad_(True) + y = y.requires_grad_(True) z = torch.relu(x * y[:, None]) grad_x, grad_y = torch.autograd.grad(z, [x, y], dz, retain_graph=True) return grad_x From 59a621f9b6f8aec94620fba3732aa72751d25338 Mon Sep 17 00:00:00 2001 From: Astarag Mohapatra <40949756+Athe-kunal@users.noreply.github.com> Date: Thu, 30 Oct 2025 23:51:56 -0700 Subject: [PATCH 2/5] Update accumulator initialization in helion_puzzles.rst Change accumulator initialization to use the device of the input tensor. Also use `zeros` instead of `zeros_like` because the latter accepts a tensor --- docs/helion_puzzles.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/helion_puzzles.rst b/docs/helion_puzzles.rst index af2d28ee9..bd463a7ab 100644 --- a/docs/helion_puzzles.rst +++ b/docs/helion_puzzles.rst @@ -326,7 +326,7 @@ Sum of a batch of numbers. 
# Use Helion to tile the batch dimension for tile_batch in hl.tile(batch): # Initialize accumulator for each batch element - acc = torch.zeros_like(tile_batch, dtype=torch.float32) + acc = torch.zeros(tile_batch, dtype=torch.float32, device=x.device) # Process the sequence in chunks for tile_seq in hl.tile(seq_len): From 50d4e51598b84cff667f2c9788d109ebd00c8c40 Mon Sep 17 00:00:00 2001 From: Astarag Mohapatra <40949756+Athe-kunal@users.noreply.github.com> Date: Mon, 3 Nov 2025 09:04:56 -0800 Subject: [PATCH 3/5] Replace torch.max with torch.amax torch.max is not supported for helion kernels --- docs/helion_puzzles.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/helion_puzzles.rst b/docs/helion_puzzles.rst index bd463a7ab..0867872d3 100644 --- a/docs/helion_puzzles.rst +++ b/docs/helion_puzzles.rst @@ -433,7 +433,7 @@ A scalar version of FlashAttention. scores = q_tile[:, None] * k_tile[None, :] # Find max for numerical stability - batch_max = torch.max(scores, dim=1)[0] + batch_max = torch.amax(scores, dim=1) new_max = torch.maximum(max_val, batch_max) # Scale old accumulations From cd0fb69fc392796ad22b4c20e6eb3313b1311dfc Mon Sep 17 00:00:00 2001 From: Astarag Mohapatra <40949756+Athe-kunal@users.noreply.github.com> Date: Fri, 7 Nov 2025 22:18:25 -0800 Subject: [PATCH 4/5] Modify tensor initialization and padding for device compatibility Updated tensor operations to ensure compatibility with input device. --- docs/helion_puzzles.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/helion_puzzles.rst b/docs/helion_puzzles.rst index 0867872d3..a34c0de08 100644 --- a/docs/helion_puzzles.rst +++ b/docs/helion_puzzles.rst @@ -468,8 +468,8 @@ A batched 2D convolution. .. 
code-block:: python def conv2d_spec(x: Float32[Tensor, "4 8 8"], k: Float32[Tensor, "4 4"]) -> Float32[Tensor, "4 8 8"]: - z = torch.zeros(4, 8, 8) - x = torch.nn.functional.pad(x, (0, 4, 0, 4, 0, 0), value=0.0) + z = torch.zeros(4, 8, 8).to(x.device) + x = torch.nn.functional.pad(x, (0, 4, 0, 4, 0, 0), value=0.0).to(x.device) for i in range(8): for j in range(8): z[:, i, j] = (k[None, :, :] * x[:, i: i+4, j: j + 4]).sum(1).sum(1) @@ -495,7 +495,7 @@ A batched 2D convolution. # Extract the patch patch = x_padded[tile_batch, i:i+kh, j:j+kw] # Apply the kernel - out[tile_batch, i, j] = (k[tile_batch] * patch).sum([1, 2]) + out[tile_batch, i, j] = (k[tile_batch,:,:] * patch).sum([1, 2]) return out From 2decb8ca8b4eb8a998bb849d1d966cdf9ccd7168 Mon Sep 17 00:00:00 2001 From: Astarag Mohapatra <40949756+Athe-kunal@users.noreply.github.com> Date: Sat, 8 Nov 2025 15:07:51 -0800 Subject: [PATCH 5/5] Refactor tensor initialization in conv2d_spec --- docs/helion_puzzles.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/helion_puzzles.rst b/docs/helion_puzzles.rst index a34c0de08..3045df48c 100644 --- a/docs/helion_puzzles.rst +++ b/docs/helion_puzzles.rst @@ -468,8 +468,8 @@ A batched 2D convolution. .. code-block:: python def conv2d_spec(x: Float32[Tensor, "4 8 8"], k: Float32[Tensor, "4 4"]) -> Float32[Tensor, "4 8 8"]: - z = torch.zeros(4, 8, 8).to(x.device) - x = torch.nn.functional.pad(x, (0, 4, 0, 4, 0, 0), value=0.0).to(x.device) + z = torch.zeros(4, 8, 8, device=x.device) + x = torch.nn.functional.pad(x, (0, 4, 0, 4, 0, 0), value=0.0) for i in range(8): for j in range(8): z[:, i, j] = (k[None, :, :] * x[:, i: i+4, j: j + 4]).sum(1).sum(1)