In [23]:
import numpy as np
from numba import njit
from numba import cuda
import time
import math

In [24]:
arr = np.random.randint(10, size=(4096,4096))
def reverse2D(arr):
    for i in range(len(arr)):
        array_len = len(arr[i])
        start = 0
        end = array_len - 1
        while(start < end):
            arr[i][start], arr[i][end] = arr[i][end], arr[i][start]
            start += 1
            end -= 1
    return arr
            
%timeit reverse2D(arr)

8.26 s ± 64.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [25]:
@njit
def reverse2Dnjit(arr):
    for i in range(len(arr)):
        array_len = len(arr[i])
        start = 0
        end = array_len - 1
        while(start < end):
            arr[i][start], arr[i][end] = arr[i][end], arr[i][start]
            start += 1
            end -= 1
    return arr
            
%timeit reverse2Dnjit(arr)

13.4 ms ± 126 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [26]:
arr = np.random.randint(10, size=(4096,4096))
print(arr)

@cuda.jit
def my_kernel(arr):
    pos = cuda.grid(1)
    for pos in range(len(arr)):
        array_len = len(arr[pos])
        start = 0
        end = array_len - 1
        while (start < end):
            arr[pos][start], arr[pos][end] = arr[pos][end], arr[pos][start]
            start += 1
            end -= 1

threadsperblock = 256
blockspergrid = math.ceil(data.shape[0] / threadsperblock)
my_kernel[blockspergrid, threadsperblock](arr)


[[8 2 0 ... 2 4 5]
 [1 0 2 ... 7 1 5]
 [9 3 2 ... 2 8 5]
 ...
 [8 1 9 ... 3 4 6]
 [4 6 8 ... 6 1 6]
 [5 2 3 ... 1 5 2]]


In [27]:
%timeit my_kernel[blockspergrid, threadsperblock](arr)

1.77 s ± 484 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
