**Install magic**

In [None]:
# -f: abort on HTTP errors instead of piping an error page into bash;
# -sS: stay quiet but still report failures; -L: follow redirects.
!curl -fsSL https://magic.modular.com/ | bash

Installing the latest version of Magic...
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 49.9M  100 49.9M    0     0  30.3M      0  0:00:01  0:00:01 --:--:--  165M
Done. The 'magic' binary is in '/root/.modular/bin'

Two more steps:
1. To use 'magic', run this command so it's in your PATH:
source /root/.bashrc
2. To build with MAX and Mojo, go to http://modul.ar/get-started


**Add the Magic binary directory to `PATH`**

In [None]:
import os

# Directory where the installer reported placing the `magic` binary.
MAGIC_BIN_DIR = '/root/.modular/bin'

# Append only when the directory is missing, so re-running this cell does not
# keep growing PATH with duplicate entries. Also tolerates an unset PATH.
_current_path = os.environ.get('PATH', '')
if MAGIC_BIN_DIR not in _current_path.split(os.pathsep):
    os.environ['PATH'] = (
        f'{_current_path}{os.pathsep}{MAGIC_BIN_DIR}' if _current_path else MAGIC_BIN_DIR
    )

In [None]:
# Report the CUDA compiler version to confirm the CUDA toolkit is reachable.
!nvcc --version

In [None]:
# Show the GPU(s) visible to this session together with driver details.
!nvidia-smi

**Create a new Mojo project called 'mojo_kernels'**

In [None]:
# Scaffold the project; with '--format mojoproject' Magic writes a mojoproject.toml manifest.
!magic init mojo_kernels --format mojoproject

✔ Created /kaggle/working/mojo_kernels/mojoproject.toml


In [None]:
# Work from inside the project directory for the remaining cells.
%cd mojo_kernels

/kaggle/working/mojo_kernels


**Write out the Mojo kernel**

In [None]:
%%writefile buffer_add_kernel.mojo

from gpu import thread_idx, block_idx, block_dim
from gpu.host import DeviceContext, DeviceBuffer
from layout import Layout, LayoutTensor
from math import iota

# Element type shared by every buffer and tensor in this example.
alias dtype = DType.uint32
# Number of elements in each vector: 2^16 = 65536.
alias elem_count = 1 << 16
# Threads per block: 2^10 = 1024.
alias threads = 1 << 10
# Blocks needed to cover every element, i.e. ceil(elem_count / threads).
# Floor division keeps the computation in integers instead of dividing into a
# float and truncating it back.
alias blocks = Int((elem_count + threads - 1) // threads)
# One-dimensional row-major layout spanning the whole vector.
alias data_layout = Layout.row_major(elem_count)
# Tensor view type used for the kernel's inputs and output.
alias DataTensor = LayoutTensor[dtype, data_layout, MutableAnyOrigin]

fn buffer_add_kernel(input1: DataTensor, input2: DataTensor, output: DataTensor, elem_count: Int):
    """Element-wise add: each thread writes `input1[i] + input2[i]` to `output[i]`.

    Args:
        input1: First operand tensor.
        input2: Second operand tensor.
        output: Tensor that receives the sums.
        elem_count: Number of valid elements; threads at or past it do nothing.
    """
    # Flatten the (block, thread) coordinates into a single global index.
    var global_idx = block_dim.x * block_idx.x + thread_idx.x

    # Threads whose index falls past the end have no element to process.
    if global_idx >= elem_count:
        return

    output[global_idx] = input1[global_idx] + input2[global_idx]

fn main() raises:
    """Adds two iota-filled vectors on the GPU and prints the last sum.

    Both inputs hold 0, 1, ..., elem_count - 1, so the printed value is
    expected to be 2 * (elem_count - 1).

    Raises:
        Any error propagated from the device context or buffer calls.
    """
    ctx = DeviceContext()

    var input_buffer_1 = ctx.enqueue_create_buffer[dtype](elem_count)
    var input_buffer_2 = ctx.enqueue_create_buffer[dtype](elem_count)
    var output_buffer = ctx.enqueue_create_buffer[dtype](elem_count)

    # Fill both inputs with an increasing sequence through a host-side view.
    with input_buffer_1.map_to_host() as input_buff_1:
        iota(input_buff_1.unsafe_ptr(), elem_count)
    with input_buffer_2.map_to_host() as input_buff_2:
        iota(input_buff_2.unsafe_ptr(), elem_count)

    # Start the output from a known state. The kernel overwrites each element
    # it covers rather than accumulating into it.
    _ = output_buffer.enqueue_fill(0)

    # Wrap the raw buffers in tensor views of the type the kernel expects.
    var input1 = DataTensor(input_buffer_1)
    var input2 = DataTensor(input_buffer_2)
    var output = DataTensor(output_buffer)

    ctx.enqueue_function[buffer_add_kernel](
        input1,
        input2,
        output,
        elem_count,
        grid_dim=blocks,
        block_dim=threads,
    )
    # Wait for the queued work to finish before reading the result.
    ctx.synchronize()

    # Print the last element. The index is derived from elem_count so it stays
    # in bounds if the vector size is changed.
    with output_buffer.map_to_host() as host_buffer:
        print(host_buffer[elem_count - 1])




Writing buffer_add_kernel.mojo


In [None]:
# Run the kernel file inside the project's Magic environment.
# The printed last element should be 65535 + 65535 = 131070.
!magic run mojo buffer_add_kernel.mojo

[2K[32mâ [0m activating environment                                                        131070
