In [6]:
import pyopencl as cl
import numpy
import time

In [7]:

# --- OpenCL setup ------------------------------------------------------------
# Take the first device of the first platform the runtime reports and create a
# context and a command queue on it.
platform = cl.get_platforms()[0]
device = platform.get_devices()[0]
context = cl.Context([device])
queue = cl.CommandQueue(context)

# --- Problem size and host-side inputs ---------------------------------------
N = 2048  # both operands and the result are N x N matrices

# Seeded generator: the inputs (and therefore the printed product) are the same
# on every Restart & Run All. Sampling directly as float32 avoids creating a
# float64 temporary that is then cast down.
SEED = 0
rng = numpy.random.default_rng(SEED)
a = rng.random((N, N), dtype=numpy.float32)
b = rng.random((N, N), dtype=numpy.float32)

# --- Device buffers ----------------------------------------------------------
# a and b are copied from host memory when the buffers are created; c is an
# output buffer with the same byte size as one input matrix.
a_buffer = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=a)
b_buffer = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=b)
c_buffer = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, a.nbytes)

# OpenCL C source for a naive dense matrix product c = a * b on row-major
# N x N float matrices. Each work-item computes one element of c.
kernel_code = """
__kernel void matrix_multiply(__global const float* a,
                              __global const float* b,
                              __global float* c,
                              const int N) {
    // Dimension 0 is mapped to the column, so work-items that are adjacent in
    // dimension 0 read adjacent elements of b and write adjacent elements of c
    // (instead of elements that are N floats apart).
    int col = get_global_id(0);
    int row = get_global_id(1);

    // Ignore work-items that fall outside the matrix.
    if (row < N && col < N) {
        float sum = 0.0f;  // 'f' suffix keeps the literal single precision
        for (int i = 0; i < N; i++) {
            sum += a[row * N + i] * b[i * N + col];
        }
        c[row * N + col] = sum;
    }
}
"""
# Compile the kernel source for the devices in `context`.
program = cl.Program(context, kernel_code).build()
# Allocate the host-side result before starting the clock, so the measurement
# covers the kernel and the device-to-host read-back but not the allocation.
c = numpy.empty((N, N), dtype=numpy.float32)

# perf_counter() is monotonic and high resolution; time.time() is a wall clock
# that can jump if the system time is adjusted.
start = time.perf_counter()

# Launch one work-item per output element (global size = a.shape, i.e. (N, N));
# passing None lets the runtime pick the work-group size.
program.matrix_multiply(queue, a.shape, None, a_buffer, b_buffer, c_buffer, numpy.int32(N))

# Read the result back, then drain the queue so that all enqueued work has
# definitely completed before the timer is stopped.
cl.enqueue_copy(queue, c, c_buffer)
queue.finish()
end = time.perf_counter()

print(c)
print(end - start)


[[512.805   520.81805 507.3237  ... 498.42325 515.8787  493.18442]
 [494.1038  497.87378 488.82632 ... 490.1228  497.6528  482.5827 ]
 [513.4464  515.1043  507.78064 ... 512.11694 522.31854 508.23407]
 ...
 [519.3453  518.00385 520.05164 ... 514.95764 525.53015 503.58734]
 [519.65424 524.8355  522.49615 ... 510.86127 530.4609  509.93924]
 [512.21423 512.2386  507.13354 ... 503.00583 512.8709  496.72263]]
5.960893869400024
