<a href="https://colab.research.google.com/github/ratulb/mojo_programming/blob/main/gpu_puzzles/broadcast_add_layout.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!curl -ssL https://magic.modular.com/ | bash

In [None]:
import os
os.environ['PATH'] += ':/root/.modular/bin'

In [None]:
!magic init gpu_puzzles --format mojoproject

In [None]:
%cd gpu_puzzles/

In [14]:
%%writefile broadcast_add_layout.mojo

### Broadcast Addiotion
### Add 2 vectors

from gpu import thread_idx
from gpu.host import DeviceContext
from layout import Layout, LayoutTensor
from testing import assert_equal


alias SIZE = 3
alias dtype = DType.float32
alias BLOCKS_PER_GRID = 1
alias THREADS_PER_BLOCK = (3, 3)

alias layout_out = Layout.row_major(SIZE, SIZE)
alias layout_a = Layout.row_major(1, SIZE)
alias layout_b = Layout.row_major(SIZE, 1)



fn broadcast_add_layout[layout_out: Layout, layout_a: Layout, layout_b: Layout](
    out: LayoutTensor[mut=True, dtype, layout_out],
    a: LayoutTensor[mut=True, dtype, layout_a],
    b: LayoutTensor[mut=True, dtype, layout_b],
):
    row = thread_idx.y
    col = thread_idx.x
    if row < SIZE and col < SIZE:
        out[row, col] = a[0, row] + b[col, 0]


fn main() raises:
    with DeviceContext() as ctx:
        out_buffer = ctx.enqueue_create_buffer[dtype](SIZE * SIZE).enqueue_fill(0)
        expected_buffer = ctx.enqueue_create_host_buffer[dtype](
            SIZE * SIZE
        ).enqueue_fill(0)
        a_buffer = ctx.enqueue_create_buffer[dtype](SIZE).enqueue_fill(0)
        b_buffer = ctx.enqueue_create_buffer[dtype](SIZE).enqueue_fill(0)

        with a_buffer.map_to_host() as a_buffer_host, b_buffer.map_to_host() as b_buffer_host:
            for i in range(SIZE):
                a_buffer_host[i] = i
                b_buffer_host[i] = i
            print(a_buffer)
            print(b_buffer)
            for i in range(SIZE):
                for j in range(SIZE):
                    expected_buffer[i * SIZE + j] = a_buffer_host[i] + b_buffer_host[j]
            print(expected_buffer)

        out = LayoutTensor[mut=True, dtype, layout_out](out_buffer.unsafe_ptr())
        a =  LayoutTensor[mut=True, dtype, layout_a](a_buffer.unsafe_ptr())
        b = LayoutTensor[mut=True, dtype, layout_b](b_buffer.unsafe_ptr())
        expected = LayoutTensor[mut=True, dtype, layout_out](expected_buffer.unsafe_ptr())

        ctx.enqueue_function[broadcast_add_layout[layout_out, layout_a, layout_b]](
            out,
            a,
            b,
            SIZE,
            grid_dim=BLOCKS_PER_GRID,
            block_dim=THREADS_PER_BLOCK,
        )
        ctx.synchronize()

        with out_buffer.map_to_host() as out_buffer_host:
            print(out_buffer_host)
            for i in range(SIZE):
                for j in range(SIZE):
                    assert_equal(out_buffer_host[i * SIZE + j], expected_buffer[i * SIZE + j])


Overwriting broadcast_add_layout.mojo


In [15]:
!magic run mojo broadcast_add_layout.mojo

[32m⠁[0m                                                                               [2K[32m⠁[0m activating environment                                                        [2K[32m⠁[0m activating environment                                                        [2KDeviceBuffer([0.0, 0.0, 0.0])
DeviceBuffer([0.0, 0.0, 0.0])
HostBuffer([0.0, 1.0, 2.0, 1.0, 2.0, 3.0, 2.0, 3.0, 4.0])
HostBuffer([0.0, 1.0, 2.0, 1.0, 2.0, 3.0, 2.0, 3.0, 4.0])


In [None]:
!magic run mojo format broadcast_add_layout.mojo