In [4]:
from numba import cuda, float32
import numpy as np
import time

@cuda.jit
def initialize_matrix(matrix, random_values):
  i, j = cuda.grid(2)
  if i < matrix.shape[0] and j < matrix.shape[1]:
    matrix[i, j] = random_values[i, j]

def main():
  N = 4000
  random_values_cpu = np.random.rand(N, N).astype(np.float32)

  matrix_gpu = cuda.device_array((N, N), dtype=np.float32)
  random_values_gpu = cuda.to_device(random_values_cpu)

  threads_per_block = (16, 16)
  blocks = ((N + threads_per_block[0] - 1) // threads_per_block[0],
            (N + threads_per_block[1] - 1) // threads_per_block[1])

  start_time = time.time()
  initialize_matrix[blocks, threads_per_block](matrix_gpu, random_values_gpu)
  cuda.synchronize()
  end_time = time.time()

  elapsed_time = (end_time - start_time) * 1000
  print("Time taken by GPU:", elapsed_time, "milliseconds")

  matrix_result = matrix_gpu.copy_to_host()

if __name__ == "__main__":
  main()



Time taken by GPU: 99.55430030822754 milliseconds
