<a href="https://colab.research.google.com/github/ratulb/mojo_programming/blob/main/mojo_kernels/sum_1d_tensor_kernel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!nvcc --version

In [None]:
!nvidia-smi

In [None]:
!curl -ssL https://magic.modular.com/ | bash

In [None]:
import os
os.environ['PATH'] += ':/root/.modular/bin'

In [None]:
!magic init mojo_kernels --format mojoproject

In [None]:
%cd mojo_kernels/

In [None]:
%%writefile sum_1d_tensor_kernel.mojo

from gpu import thread_idx, block_idx, grid_dim
from gpu.host import DeviceContext
from math import iota
from layout import Layout, LayoutTensor
from sys.intrinsics import _GridDim, _BlockIdx
from algorithm import vectorize
from sys import simdwidthof


# Element type shared by the device buffer, the layout tensor and the kernel.
alias dtype = DType.uint32
# Passed to enqueue_function as grid_dim (number of blocks launched).
alias blocks = 4
# Passed to enqueue_function as block_dim (threads per block).
alias threads = 1
# 35 elements (32 + 3): not a multiple of `blocks`, so the last block
# receives a remainder on top of its even share.
alias elems_count = (1 << 5) + 3
# Single-row, row-major layout with `elems_count` columns.
alias layout = Layout.row_major(1, elems_count)


fn summer[
    type: DType, layout: Layout, //, simdwidth: Int = simdwidthof[type]()
](
    tensor: LayoutTensor[type, layout, MutableAnyOrigin],
    start: Int = 0,
    end: Int = layout.size(),
) -> Scalar[type]:
    """Sums the elements of row 0 of `tensor` in the column range [start, end).

    Parameters:
        type: Element type of the tensor (inferred).
        layout: Layout of the tensor (inferred).
        simdwidth: Chunk width handed to `vectorize`; defaults to the SIMD
            width reported by `simdwidthof` for `type`.

    Args:
        tensor: Tensor to read from; only row 0 is accessed.
        start: First column included in the sum.
        end: Column one past the last one included; defaults to the total
            number of elements in `layout`.

    Returns:
        The sum of the selected elements as a scalar of `type`.
    """
    var total = Scalar[type](0)
    var count = end - start

    @parameter
    fn accumulate[chunk_width: Int](offset: Int):
        # `offset` is relative to `start`; load one chunk and fold it into
        # the running total.
        var chunk = tensor.load[width=chunk_width](0, start + offset)
        total += chunk.reduce_add()

    vectorize[accumulate, simdwidth](count)
    return total


fn block_indices(
    in_tensor: LayoutTensor,
    griddim: _GridDim,
    blockidx: _BlockIdx,
) -> (Int, Int):
    """Computes the half-open element range [start, end) owned by one block.

    The tensor's elements are split evenly across `griddim.x` blocks; the
    elements left over by the integer division are all assigned to the last
    block.

    Args:
        in_tensor: Tensor whose `size()` is being partitioned.
        griddim: Grid dimensions; only `x` is used.
        blockidx: Block index; only `x` is used.

    Returns:
        A `(start, end)` pair for the block identified by `blockidx.x`.
    """
    var total = in_tensor.size()
    var block_count = griddim.x
    var block_id = blockidx.x

    var chunk = total // block_count
    var first = chunk * block_id
    var last = first + chunk

    # The final block additionally takes whatever the even split left over.
    if block_id == block_count - 1:
        last += total % block_count

    return first, last

fn sum_1d_tensor_kernel(
    in_tensor: LayoutTensor[dtype, layout, MutableAnyOrigin]
):
    """Kernel that prints one block's element range and the sum over it.

    The range comes from `block_indices` using the launch's `grid_dim` and
    `block_idx`; the sum comes from `summer`. Nothing is written back to
    `in_tensor`; the results are only printed.

    Args:
        in_tensor: Input tensor to partition across blocks and sum.
    """
    # The slice depends only on the block index, so every thread in a block
    # would otherwise repeat the same work and print duplicate lines when
    # launched with block_dim > 1. Let thread 0 of each block do it alone.
    if thread_idx.x != 0:
        return

    start, end = block_indices(in_tensor, grid_dim, block_idx)
    print(start, end)
    result = summer(in_tensor, start, end)
    print(result)


def main():
    """Fills a device buffer with 0..elems_count-1 and launches the sum kernel.

    The buffer is initialised through a host mapping, wrapped in a
    `LayoutTensor`, and passed to `sum_1d_tensor_kernel` with `blocks` as the
    grid size and `threads` as the block size. Prints "ok" once the context
    has been synchronised.
    """
    var ctx = DeviceContext()
    var device_buf = ctx.enqueue_create_buffer[dtype](elems_count)

    # NOTE(review): presumably the mapping makes the host-side writes visible
    # to the device when the `with` scope exits — confirm against the
    # DeviceBuffer.map_to_host documentation.
    with device_buf.map_to_host() as host_view:
        iota(host_view.unsafe_ptr(), elems_count)
        print(host_view)

    # Tensor view over the device buffer; it does not use the host mapping.
    var in_tensor = LayoutTensor[dtype, layout, MutableAnyOrigin](device_buf)

    ctx.enqueue_function[sum_1d_tensor_kernel](
        in_tensor, grid_dim=blocks, block_dim=threads
    )

    ctx.synchronize()
    print("ok")


Overwriting sum_1d_tensor_kernel.mojo


In [None]:
!magic run mojo sum_1d_tensor_kernel.mojo

[32m⠁[0m                                                                               [2K[32m⠁[0m activating environment                                                        [2K[32m⠁[0m activating environment                                                        [2KHostBuffer([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34])
0 8
8 16
16 24
24 35
92
156
28
319
ok


In [None]:
!magic run mojo format sum_1d_tensor_kernel.mojo

[32m⠁[0m                                                                               [2K[32m⠁[0m activating environment                                                        [2K[32m⠁[0m activating environment                                                        [2K[1mreformatted sum_1d_tensor_kernel.mojo[0m

[1mAll done! ✨ 🍰 ✨[0m
[34m[1m1 file [0m[1mreformatted[0m.


In [None]:
8 + 9 + 10 + 11 + 12 + 13 + 14 + 15

92

In [None]:
92 + 156 + 28 + 319 - 32 - 33 - 34

496