In [None]:
# Report the version of the `nvcc` binary available in this runtime.
!nvcc --version

In [None]:
# Run `nvidia-smi` to check that this runtime has an NVIDIA GPU attached
# before any GPU kernels are built.
!nvidia-smi

In [1]:
# Install the Magic CLI by running Modular's installer script.
#   -f  fail on HTTP errors instead of piping an error page into bash
#   -sS stay quiet but still report transfer errors
#   -L  follow redirects
# NOTE(review): this executes a remote script without verification; only run
# it in a disposable runtime such as this one.
!curl -fsSL https://magic.modular.com/ | bash

Installing the latest version of Magic...
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:--  0:00:03 --:--:--     0
100 49.9M  100 49.9M    0     0  8330k      0  0:00:06  0:00:06 --:--:-- 33.3M
Done. The 'magic' binary is in '/root/.modular/bin'

Two more steps:
1. To use 'magic', run this command so it's in your PATH:
source /root/.bashrc
2. To build with MAX and Mojo, go to http://modul.ar/get-started


In [2]:
import os

# Directory the Magic installer reported placing the `magic` binary in.
MODULAR_BIN = '/root/.modular/bin'


def append_to_path(directory: str) -> None:
    """Append ``directory`` to the ``PATH`` environment variable once.

    Re-running the cell must not keep growing ``PATH``, so the directory is
    only added when it is not already one of the entries. An unset or empty
    ``PATH`` is handled by setting it to ``directory`` alone.

    Args:
        directory: Absolute directory to make searchable for executables.
    """
    current = os.environ.get('PATH', '')
    if directory in current.split(os.pathsep):
        return
    os.environ['PATH'] = f'{current}{os.pathsep}{directory}' if current else directory


# Make `magic` visible to later `!magic ...` cells.
append_to_path(MODULAR_BIN)

In [3]:
# Create a new Magic project directory named `mojo_kernels`, using the
# `mojoproject` manifest format (the output shows it writes mojoproject.toml).
!magic init mojo_kernels --format mojoproject

[32m✔ [0mCreated /content/mojo_kernels/mojoproject.toml


In [4]:
# Switch the notebook's working directory to the project created by `magic init`,
# so later `magic run` cells execute inside it.
# NOTE(review): the path is relative, so re-running this cell from inside the
# project would look for a nested mojo_kernels/ directory.
%cd mojo_kernels/

/content/mojo_kernels


In [None]:
# Run `mojo --version` through `magic run` to confirm the project's Mojo
# toolchain is usable.
!magic run mojo --version

In [17]:
%%writefile sum_reduce_kernel.mojo

from gpu import thread_idx, block_idx, barrier
from gpu.host import DeviceContext
from gpu.memory import AddressSpace
from layout import Layout, LayoutTensor
from math import iota
from memory import stack_allocation


# Element type shared by the input and output buffers.
alias dtype = DType.uint32
# Launch configuration: `blocks` thread blocks of `threads` threads each.
alias blocks = 4
alias threads = 4
# The input is viewed as a row-major (blocks, threads) grid: one row per
# block, one element per thread.
alias layout = Layout.row_major(blocks, threads)
alias in_elements = blocks * threads
# The kernel produces one sum per block and `main` allocates `blocks` output
# elements, so the output layout describes exactly `blocks` elements; a larger
# layout would claim more storage than the buffer holds.
alias out_layout = Layout.row_major(blocks)
alias InTensor = LayoutTensor[dtype, layout, MutableAnyOrigin]
alias OutTensor = LayoutTensor[dtype, out_layout, MutableAnyOrigin]


fn sum_reduce_kernel(in_tensor: InTensor, out_tensor: OutTensor):
    """Add the sum of each block's input row to that block's output slot.

    Every thread copies one input element into a per-block scratch array
    allocated in the shared address space. After the barrier, thread 0 of the
    block totals the scratch array and adds the result to
    `out_tensor[block_idx.x]`.

    Args:
        in_tensor: Input values, indexed as [block index, thread index].
        out_tensor: Per-block results. The total is added to the value already
            stored in the slot, so the caller should zero it beforehand.
    """
    var scratch = stack_allocation[
        threads, Scalar[dtype], address_space = AddressSpace.SHARED
    ]()
    # Take element 0 of the indexed tensor value to store it as a scalar.
    scratch[thread_idx.x] = in_tensor[block_idx.x, thread_idx.x][0]

    # Do not read the scratch array until every thread has stored its element.
    barrier()

    if thread_idx.x == 0:
        # Accumulate in a local so the output slot is read and written once
        # rather than once per element.
        var block_total = Scalar[dtype](0)
        for i in range(threads):
            block_total += scratch[i]
        out_tensor[block_idx.x] += block_total


fn main() raises:
    """Run the block-sum kernel on 0..in_elements-1 and print input and output.

    Allocates the input and output device buffers, zeroes the output, fills
    the input with consecutive integers through a host mapping, launches
    `sum_reduce_kernel` with `blocks` blocks of `threads` threads, and prints
    the per-block sums.
    """
    device = DeviceContext()

    # Device storage: one input element per thread, one result per block.
    input_buf = device.enqueue_create_buffer[dtype](in_elements)
    result_buf = device.enqueue_create_buffer[dtype](blocks)
    # The kernel adds into the output, so it must start at zero.
    _ = result_buf.enqueue_fill(0)

    # Populate the input through a host-side mapping and show it.
    with input_buf.map_to_host() as host_in:
        iota(host_in.unsafe_ptr(), in_elements)
        print(host_in)

    # Tensor views over the raw buffers, created right where they are used.
    in_view = InTensor(input_buf)
    out_view = OutTensor(result_buf)

    device.enqueue_function[sum_reduce_kernel](
        in_view,
        out_view,
        grid_dim=blocks,
        block_dim=threads,
    )

    # Map the results back to the host and show the per-block sums.
    with result_buf.map_to_host() as host_out:
        print(host_out)



Overwriting sum_reduce_kernel.mojo


In [18]:
# Run sum_reduce_kernel.mojo with `mojo` through `magic run`; the program
# prints the input buffer followed by the per-block sums.
!magic run mojo sum_reduce_kernel.mojo

[32m⠁[0m                                                                               [2K[32m⠁[0m activating environment                                                        [2K[32m⠁[0m activating environment                                                        [2KHostBuffer([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
HostBuffer([6, 22, 38, 54])


In [15]:
# Run the Mojo formatter over the kernel source file on disk.
# NOTE(review): this cell's execution count (15) is lower than the %%writefile
# (17) and run (18) cells, so it was executed before their latest versions.
# Re-running %%writefile overwrites the file and discards the formatter's
# changes; consider copying the formatted source back into that cell.
!magic run mojo format sum_reduce_kernel.mojo

[32m⠁[0m                                                                               [2K[32m⠁[0m activating environment                                                        [2K[32m⠁[0m activating environment                                                        [2K[1mreformatted sum_reduce_kernel.mojo[0m

[1mAll done! ✨ 🍰 ✨[0m
[34m[1m1 file [0m[1mreformatted[0m.
