In [1]:
# Report the CUDA compiler (nvcc) version installed on this machine.
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [2]:
# Show the attached GPU, the driver version and current memory/utilisation.
!nvidia-smi

Tue May 13 04:50:43 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   37C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [3]:
# Install the Magic CLI by piping Modular's installer script into bash.
# -f: fail on an HTTP error instead of piping an error page into bash
# -sS: hide curl's own progress meter but still print errors
# -L: follow redirects
!curl -fsSL https://magic.modular.com/ | bash

Installing the latest version of Magic...
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:--  0:00:02 --:--:--     0
100 49.9M  100 49.9M    0     0  12.8M      0  0:00:03  0:00:03 --:--:-- 39.7M
Done. The 'magic' binary is in '/root/.modular/bin'

Two more steps:
1. To use 'magic', run this command so it's in your PATH:
source /root/.bashrc
2. To build with MAX and Mojo, go to http://modul.ar/get-started


In [4]:
import os

# Directory the installer reported as holding the `magic` binary.
MAGIC_BIN_DIR = '/root/.modular/bin'

# Append the directory to PATH so later `!magic ...` cells can find the binary.
# The membership check keeps the cell idempotent: re-running it does not add a
# duplicate entry, and a missing PATH variable is treated as empty.
path_entries = os.environ.get('PATH', '').split(os.pathsep)
if MAGIC_BIN_DIR not in path_entries:
    path_entries.append(MAGIC_BIN_DIR)
    os.environ['PATH'] = os.pathsep.join(path_entries)

In [5]:
# Create a new Magic project named `mojo_kernels` using the mojoproject manifest format.
!magic init mojo_kernels --format mojoproject

[32m✔ [0mCreated /content/mojo_kernels/mojoproject.toml


In [6]:
# Enter the project directory; later cells write and run files using relative paths.
%cd mojo_kernels/

/content/mojo_kernels


In [49]:
%%writefile sum_1d_tensor_kernel.mojo

from gpu import thread_idx, block_idx, grid_dim
from gpu.host import DeviceContext
from math import iota
from layout import Layout, LayoutTensor
from sys.intrinsics import _GridDim, _BlockIdx
from algorithm import vectorize
from sys import simdwidthof

# Element type of the input tensor and of the per-block partial sums
alias dtype = DType.uint32
# Number of thread blocks in the launch grid; also the length of the output buffer
alias blocks = 20
# Threads per block: one thread per block sums that block's section of the 1D input tensor
alias threads = 1
# Number of input elements: 2**10 + 3 = 1027, which is not a multiple of `blocks`.
# The leftover elements are summed by the thread in the last block.
alias elems_count = (1 << 10) + 3
# Row-major layouts: the input is a single row of `elems_count` elements,
# the output has `blocks` elements (one partial sum per block)
alias layout = Layout.row_major(1, elems_count)
alias output_layout = Layout.row_major(blocks)


# Sum the elements of the single-row `tensor` whose column indices lie in the
# range from `start` (inclusive) to `end` (exclusive). With the defaults the
# whole tensor is summed.
# The range is walked with the standard library's `vectorize`: every chunk is
# loaded from row 0, collapsed with `reduce_add` and added to a running total.
fn summer[
    type: DType, layout: Layout, //, simdwidth: Int = simdwidthof[type]()
](
    tensor: LayoutTensor[type, layout, MutableAnyOrigin],
    start: Int = 0,
    end: Int = layout.size(),
) -> Scalar[type]:
    var total = Scalar[type](0)
    # Number of elements to visit.
    var span = end - start

    @parameter
    fn add_chunk[chunk_width: Int](offset: Int):
        # `offset` is relative to `start`; shift it back to a tensor column.
        var chunk = tensor.load[width=chunk_width](0, start + offset)
        total += chunk.reduce_add()

    vectorize[add_chunk, simdwidth](span)
    return total


# Compute the (`start`, `end`) indices of the section of the 1D input tensor
# that the calling block is responsible for summing.
# Every block gets `size // griddim.x` elements; the last block additionally
# takes the `size % griddim.x` leftover elements.
fn block_indices(
    in_tensor: LayoutTensor,
    griddim: _GridDim,
    blockidx: _BlockIdx,
) -> (Int, Int):
    total = in_tensor.size()
    chunk = total // griddim.x
    first = chunk * blockidx.x

    # Last block: extend its section to cover the leftover elements.
    if blockidx.x == griddim.x - 1:
        return first, first + chunk + total % griddim.x

    return first, first + chunk


# The Mojo GPU kernel: each block sums its own section of `in_tensor` and
# stores that partial sum in the `out_tensor` slot selected by its block index.
fn sum_1d_tensor_kernel(
    in_tensor: LayoutTensor[dtype, layout, MutableAnyOrigin],
    out_tensor: LayoutTensor[dtype, output_layout, MutableAnyOrigin],
):
    # Section of the input owned by the calling block.
    first, last = block_indices(in_tensor, grid_dim, block_idx)
    out_tensor[block_idx.x] = summer(in_tensor, first, last)


# Host-side driver: fills the input with 0, 1, ..., elems_count - 1, launches
# the kernel with `blocks` blocks of `threads` threads, reduces the per-block
# partial sums on the host and verifies the total.
# Raises an Error when the computed total differs from the expected value.
def main():
    var ctx = DeviceContext()
    var device_input_buff = ctx.enqueue_create_buffer[dtype](elems_count)
    var device_output_buff = ctx.enqueue_create_buffer[dtype](blocks)

    # Zero the partial-sum slots before the kernel writes to them.
    _ = device_output_buff.enqueue_fill(0)

    # Fill the input through a host mapping of the device buffer.
    # NOTE(review): this relies on map_to_host making the written values
    # available to the device once the `with` block exits - confirm in the docs.
    with device_input_buff.map_to_host() as host_input_buff:
        iota(host_input_buff.unsafe_ptr(), elems_count)
        print(host_input_buff)

    # Tensor views over the device buffers. They are built outside the host
    # mapping scope because they are only used by the kernel launch below.
    in_tensor = LayoutTensor[dtype, layout, MutableAnyOrigin](
        device_input_buff
    )
    out_tensor = LayoutTensor[dtype, output_layout, MutableAnyOrigin](
        device_output_buff
    )

    ctx.enqueue_function[sum_1d_tensor_kernel](
        in_tensor, out_tensor, grid_dim=blocks, block_dim=threads
    )

    # Wait for the kernel to finish before reading the partial sums.
    ctx.synchronize()
    print("Output buff")
    var result = Scalar[dtype](0)
    with device_output_buff.map_to_host() as host_output_buff:
        print(host_output_buff)
        # View the partial sums as a single row so `summer` can reduce them.
        result_tensor = LayoutTensor[
            dtype, Layout.row_major(1, blocks), MutableAnyOrigin
        ](host_output_buff)
        result = summer(result_tensor)
        print(result)

    # The input holds 0, 1, ..., elems_count - 1, so the total must equal
    # n * (n - 1) / 2. Fail loudly instead of silently printing a wrong sum.
    alias expected_sum = elems_count * (elems_count - 1) // 2
    if result != Scalar[dtype](expected_sum):
        raise Error("GPU sum does not match the closed-form expected value")


Overwriting sum_1d_tensor_kernel.mojo


In [47]:
# Compile and run the kernel file inside the Magic project environment.
!magic run mojo sum_1d_tensor_kernel.mojo

[32m⠁[0m                                                                               [2K[32m⠁[0m activating environment                                                        [2K[32m⠁[0m activating environment                                                        [2KHostBuffer([0, 1, 2, ..., 1024, 1025, 1026])
Output buff
HostBuffer([1275, 3876, 6477, 9078, 11679, 14280, 16881, 19482, 22083, 24684, 27285, 29886, 32487, 35088, 37689, 40290, 42891, 45492, 48093, 57855])
526851


In [48]:
# Format the kernel source in place with `mojo format`.
# NOTE(review): the execution counts show this cell ran before the final %%writefile run -
# confirm the notebook still works with Restart & Run All.
!magic run mojo format sum_1d_tensor_kernel.mojo

[32m⠁[0m                                                                               [2K[32m⠁[0m activating environment                                                        [2K[32m⠁[0m activating environment                                                        [2K[1mreformatted sum_1d_tensor_kernel.mojo[0m

[1mAll done! ✨ 🍰 ✨[0m
[34m[1m1 file [0m[1mreformatted[0m.


In [None]:
# Scratch arithmetic: sum of the integers 8 through 15.
# NOTE(review): purpose is not evident from the notebook - confirm whether this cell is still needed.
8 + 9 + 10 + 11 + 12 + 13 + 14 + 15

92

In [None]:
# Scratch arithmetic; the leading 92 equals the sum 8 + ... + 15.
# NOTE(review): the meaning of the other terms is not evident - confirm whether this cell is still needed.
92 + 156 + 28 + 319 - 32 - 33 - 34

496