<a href="https://colab.research.google.com/github/ratulb/mojo_programming/blob/main/gpu_puzzles/broadcast_add_ptr.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!curl -ssL https://magic.modular.com/ | bash

In [None]:
import os
os.environ['PATH'] += ':/root/.modular/bin'

In [None]:
!magic init gpu_puzzles --format mojoproject

In [None]:
%cd gpu_puzzles/

In [36]:
%%writefile broadcast_add_ptr.mojo

### Broadcast Addition
### Add two vectors with broadcasting: out[i][j] = a[i] + b[j]

from gpu import thread_idx
from gpu.host import DeviceContext
from memory import UnsafePointer
from testing import assert_equal


# Length of each input vector; the output holds SIZE * SIZE elements.
alias SIZE = 3
# Element type shared by every buffer and by the kernel's pointer arguments.
alias dtype = DType.float32
# Launch configuration: a single block ...
alias BLOCKS_PER_GRID = 1
# ... with one thread per output element. Derived from SIZE so the block
# keeps covering the whole output if SIZE is changed.
alias THREADS_PER_BLOCK = (SIZE, SIZE)


fn broadcast_add_ptr(
    out: UnsafePointer[Scalar[dtype]],
    a: UnsafePointer[Scalar[dtype]],
    b: UnsafePointer[Scalar[dtype]],
):
    """Broadcast-add kernel: each thread writes one element of `out`.

    The thread at (thread_idx.y, thread_idx.x) = (r, c) stores
    `a[r] + b[c]` at flat index `r * SIZE + c`.

    Args:
        out: Destination; indexed up to SIZE * SIZE - 1.
        a: Operand indexed by the thread's y coordinate (up to SIZE - 1).
        b: Operand indexed by the thread's x coordinate (up to SIZE - 1).
    """
    r = thread_idx.y
    c = thread_idx.x
    # Threads that fall outside the SIZE x SIZE output do nothing.
    if r >= SIZE or c >= SIZE:
        return
    out[r * SIZE + c] = a[r] + b[c]


fn main() raises:
    """Run `broadcast_add_ptr` on the device and verify it on the host.

    Fills `a` and `b` with 0..SIZE-1, computes the expected
    `a[i] + b[j]` table on the host, launches the kernel with a single
    block, and asserts that every output element matches.

    Raises:
        Any error raised by the device context calls or by `assert_equal`
        when an output element differs from the expected value.
    """
    with DeviceContext() as ctx:
        # Zero-initialised output plus a host-side buffer for the reference.
        out_buf = ctx.enqueue_create_buffer[dtype](SIZE * SIZE).enqueue_fill(0)
        expected = ctx.enqueue_create_host_buffer[dtype](
            SIZE * SIZE
        ).enqueue_fill(0)
        a = ctx.enqueue_create_buffer[dtype](SIZE).enqueue_fill(0)
        b = ctx.enqueue_create_buffer[dtype](SIZE).enqueue_fill(0)

        # Populate the inputs through their host mappings and build the
        # reference result from the same values.
        with a.map_to_host() as a_host, b.map_to_host() as b_host:
            for i in range(SIZE):
                a_host[i] = i
                b_host[i] = i
            print(a_host)
            # Show the second operand (the original printed a_host twice).
            print(b_host)
            for i in range(SIZE):
                for j in range(SIZE):
                    expected[i * SIZE + j] = a_host[i] + b_host[j]
            print(expected)

        # Pass exactly the three pointers the kernel declares; it reads the
        # SIZE alias directly and has no size parameter.
        ctx.enqueue_function[broadcast_add_ptr](
            out_buf.unsafe_ptr(),
            a.unsafe_ptr(),
            b.unsafe_ptr(),
            grid_dim=BLOCKS_PER_GRID,
            block_dim=THREADS_PER_BLOCK,
        )
        # Block until the enqueued work has completed before reading back.
        ctx.synchronize()

        with out_buf.map_to_host() as out_host:
            print(out_host)
            for i in range(SIZE):
                for j in range(SIZE):
                    assert_equal(out_host[i * SIZE + j], expected[i * SIZE + j])


Overwriting broadcast_add_ptr.mojo


In [37]:
!magic run mojo broadcast_add_ptr.mojo

[32m⠁[0m                                                                               [2K[32m⠁[0m activating environment                                                        [2K[32m⠁[0m activating environment                                                        [2KHostBuffer([0.0, 1.0, 2.0])
HostBuffer([0.0, 1.0, 2.0])
HostBuffer([0.0, 1.0, 2.0, 1.0, 2.0, 3.0, 2.0, 3.0, 4.0])
HostBuffer([0.0, 1.0, 2.0, 1.0, 2.0, 3.0, 2.0, 3.0, 4.0])


In [35]:
!magic run mojo format broadcast_add_ptr.mojo

[32m⠁[0m                                                                               [2K[32m⠁[0m activating environment                                                        [2K[32m⠁[0m activating environment                                                        [2K[1mreformatted broadcast_add_ptr.mojo[0m

[1mAll done! ✨ 🍰 ✨[0m
[34m[1m1 file [0m[1mreformatted[0m.
