In [None]:
import pycuda.autoinit
import numpy as np
from pycuda import gpuarray
import pycuda.driver as drv
import pyculib

In [None]:
import numpy as np
import time
# import pycuda stuff
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule

# Multiply two random n x n float32 matrices with the "matmul" kernel from
# test_kernels.cu and compare the result against NumPy's np.dot.

BLOCK_SIZE = 16  # threads per block along x and along y

n = 4
ni = np.int32(n)  # explicitly sized scalar passed as the kernel's first argument

# matrix A
a = (np.random.randn(n, n) * 100).astype(np.float32)

# matrix B
b = (np.random.randn(n, n) * 100).astype(np.float32)

# matrix C (host-side result buffer; overwritten by the device-to-host copy)
c = np.empty((n, n), dtype=np.float32)

# allocate memory on device
a_gpu = cuda.mem_alloc(a.nbytes)
b_gpu = cuda.mem_alloc(b.nbytes)
c_gpu = cuda.mem_alloc(c.nbytes)

# copy the input matrices to device memory
cuda.memcpy_htod(a_gpu, a)
cuda.memcpy_htod(b_gpu, b)

# compile kernel; the context manager closes the source file deterministically
with open("test_kernels.cu", "r") as kernel_file:
    mod = SourceModule(kernel_file.read())

# get function
matmul = mod.get_function("matmul")

# set grid size: ceil(n / BLOCK_SIZE) blocks per axis so every element is covered
blocks_per_axis = (n + BLOCK_SIZE - 1) // BLOCK_SIZE
grid = (blocks_per_axis, blocks_per_axis, 1)

# call gpu function
# Kernel launches return before the GPU has finished, so synchronize before
# reading the clock; otherwise only the launch overhead would be measured.
start = time.perf_counter()
matmul(ni, a_gpu, b_gpu, c_gpu, block=(BLOCK_SIZE, BLOCK_SIZE, 1), grid=grid)
cuda.Context.synchronize()
end = time.perf_counter()
print("Time: %.5f s"%(end-start))

# copy back the result
cuda.memcpy_dtoh(c, c_gpu)

# Compute the host reference and the difference once, then report them.
expected = np.dot(a, b)
diff = c - expected
print(np.linalg.norm(diff))
print(c)
print(expected)
print(diff)

In [None]:
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda import gpuarray
from pycuda.compiler import SourceModule

# Launch the "addone" kernel from test_kernels.cu on a small zero-filled
# GPUArray, using one thread block of n_x by n_y threads, and print the result.
n_x = 3
n_y = 2
a_np = np.zeros((n_x, n_y), dtype=np.float32)
a_gpuarray = gpuarray.to_gpu(a_np)
# Second-axis length as an explicitly sized scalar for the kernel's first
# argument (presumably the row pitch used for indexing -- confirm in the .cu file).
ni = np.int32(n_y)

# Read the kernel source through a context manager so the file handle is
# closed right away instead of being left to the garbage collector.
with open("test_kernels.cu", "r") as kernel_file:
    mod = SourceModule(kernel_file.read())
addone = mod.get_function("addone")
addone(ni, a_gpuarray, block=(n_x, n_y, 1))

# .get() copies the device array back to the host for printing
print(a_gpuarray.get())

In [None]:
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import numpy as np
import skcuda.cublas as cublas

# Double-precision C := A * B through cuBLAS, with the host operands created in
# Fortran (column-major) order, the layout cuBLAS works in.
order = "F"
trans = "n"  # no transpose for either operand

A = np.array(([1, 2, 3], [4, 5, 6]), order=order).astype(np.float64)
B = np.array(([7, 8, 1, 5], [9, 10, 0, 9], [11, 12, 5, 5]), order=order).astype(np.float64)

# NOTE(review): the leading dimensions passed below assume to_gpu keeps the
# host arrays' column-major layout -- confirm for the installed PyCUDA version.
A_gpu = gpuarray.to_gpu(A)
B_gpu = gpuarray.to_gpu(B)

m, k = A_gpu.shape
k_b, n = B_gpu.shape
# Fail loudly on incompatible operands instead of silently using B's row count.
if k != k_b:
    raise ValueError("inner dimensions differ: A is %dx%d, B is %dx%d" % (m, k, k_b, n))

C_gpu = gpuarray.empty((m, n), np.float64)

alpha = np.float64(1.0)
beta  = np.float64(0.0)

cublas_handle = cublas.cublasCreate()
try:
    # Leading dimensions are the row counts of the column-major operands.
    cublas.cublasDgemm(cublas_handle, trans, trans, m, n, k, alpha, A_gpu.gpudata, m, B_gpu.gpudata, k, beta, C_gpu.gpudata, m)
finally:
    # Release the handle even when the gemm call raises.
    cublas.cublasDestroy(cublas_handle)

# The result buffer was written with leading dimension m (column-major), while
# C_gpu was allocated with the default order; re-view it with order "F".
# NOTE(review): relies on GPUArray.reshape reinterpreting the same buffer
# rather than reordering elements -- confirm against the PyCUDA docs.
C_gpu = C_gpu.reshape(C_gpu.shape, order=order)

# Host reference first, then the GPU result
print(np.dot(A, B))
print(C_gpu)


In [None]:
# This seems to work ok for C := A*B
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import numpy as np
import skcuda.cublas as cublas


# Single-precision C := A * B with ordinary C-ordered (row-major) NumPy arrays.
#
# cuBLAS is column-major, so a row-major (m, k) buffer is what cuBLAS sees as a
# (k, m) matrix with leading dimension k; requesting op 'T' on it recovers A.
# The same holds for B with leading dimension n. The output is written
# column-major with leading dimension m, i.e. it is C transposed when viewed
# row-major -- hence C_gpu is created with shape (n, m) and printed via .T.
alpha = 1.0
beta = 0.0

m = 3
k = 1
n = 5

A = np.array([[1],[2],[3]], dtype=np.float32)  # m x k
B = np.ones((k,n), dtype=np.float32)  # k x n
C = np.ones((m,n), dtype=np.float32)  # m x n
A_gpu = gpuarray.to_gpu(A)
B_gpu = gpuarray.to_gpu(B)
C_gpu = gpuarray.to_gpu(C.T.copy())

print("A")
print(A_gpu)
print("B")
print(B_gpu)

# Create the handle only once the operands are on the device, and always
# destroy it so the cuBLAS context is not leaked on every run of the cell.
cublas_handle = cublas.cublasCreate()
try:
    # The wrapper accepts the operation as a character, so the private
    # cublas._CUBLAS_OP lookup table does not need to be touched.
    cublas.cublasSgemm(cublas_handle,
                       'T',
                       'T',
                       m, n, k,
                       alpha,
                       A_gpu.gpudata, k,
                       B_gpu.gpudata, n,
                       beta,
                       C_gpu.gpudata, m)
finally:
    cublas.cublasDestroy(cublas_handle)

print("C")
print(C_gpu.T)
print("Check")
# B already has shape (k, n); no reshape is needed for the host reference.
print(np.dot(A, B))

In [None]:
# Can we get broadcasting with batch sizes to work?
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import numpy as np
import skcuda.cublas as cublas


# Z := W * X where X is assembled through a reshape / transpose round trip, to
# see how that layout interacts with the row-major-to-cuBLAS convention
# (both operands passed with op 'T', leading dimensions k and n).
alpha = 1.0
beta = 0.0

m = 1
k = 4
n = 2

W = np.array([[1, 2, 3, 4]], dtype=np.float32)  # m x k

X = np.ones((k,n), dtype=np.float32)  # k x n
# View the data as (n, 2, 2) using Fortran index order, mark part of the first
# slice with 2s, then move the leading axis last and flatten back to (k, n).
X = X.reshape((n, 2, 2), order="F")
X[0, :, 1] = np.array([2]*2, dtype=np.float32)
X = X.transpose((1, 2, 0))
X = X.reshape((k, n), order="C")

Z = np.ones((m,n), dtype=np.float32)  # m x n
W_gpu = gpuarray.to_gpu(W)
X_gpu = gpuarray.to_gpu(X)
Z_gpu = gpuarray.to_gpu(Z)

print("W")
print(W_gpu)
print("X")
print(X_gpu)

# Create the handle only once the operands are on the device, and always
# destroy it so the cuBLAS context is not leaked on every run of the cell.
cublas_handle = cublas.cublasCreate()
try:
    # Operations are given as characters; the wrapper maps them itself, so
    # the private cublas._CUBLAS_OP table is not needed.
    cublas.cublasSgemm(cublas_handle,
                       'T',
                       'T',
                       m, n, k,
                       alpha,
                       W_gpu.gpudata, k,
                       X_gpu.gpudata, n,
                       beta,
                       Z_gpu.gpudata, m)
finally:
    cublas.cublasDestroy(cublas_handle)

# With m == 1 the output has a single row, so the column-major result with
# leading dimension m occupies memory exactly like the row-major (m, n) Z_gpu.
print("Z")
print(Z_gpu)
print("Check")
print(np.dot(W, X))

In [None]:
# Host-only walk-through of the reshape / transpose round trip.
# Uses k and n as left in the kernel namespace by an earlier cell.
X = np.ones((k, n), dtype=np.float32).reshape((n, 2, 2))  # k x n, viewed as (n, 2, 2)
X[0, :, 1] = np.full(2, 2)  # two 2s written into the first slice

print("Input", X, sep="\n")

# Move the leading axis to the end, then flatten back to (k, n).
X = np.transpose(X, (1, 2, 0)).reshape((k, n))

print("Reshaped", X, sep="\n")

In [None]:
# Play with dimensions, again...
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import numpy as np
import skcuda.cublas as cublas


# Z := W * X again, this time with X written out literally instead of being
# built through reshapes. Both row-major operands are passed with op 'T' and
# leading dimensions k and n respectively.
alpha = 1.0
beta = 0.0

m = 1
k = 4
n = 2

W = np.array([[1, 2, 3, 4]], dtype=np.float32)  # m x k

X = np.array([[1., 1.], [2., 1.], [1., 1.], [2., 1.]], dtype=np.float32)  # k x n

Z = np.ones((m,n), dtype=np.float32)  # m x n
W_gpu = gpuarray.to_gpu(W)
X_gpu = gpuarray.to_gpu(X)
Z_gpu = gpuarray.to_gpu(Z)

print(X_gpu.shape)

print("W")
print(W_gpu)
print("X")
print(X_gpu)

# Create the handle only once the operands are on the device, and always
# destroy it so the cuBLAS context is not leaked on every run of the cell.
cublas_handle = cublas.cublasCreate()
try:
    # Operations are given as characters; the wrapper maps them itself, so
    # the private cublas._CUBLAS_OP table is not needed.
    cublas.cublasSgemm(cublas_handle,
                       'T',
                       'T',
                       m, n, k,
                       alpha,
                       W_gpu.gpudata, k,
                       X_gpu.gpudata, n,
                       beta,
                       Z_gpu.gpudata, m)
finally:
    cublas.cublasDestroy(cublas_handle)

print("Z")
print(Z_gpu)
print("Check")
print(np.dot(W, X))

In [None]:
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import numpy as np
import skcuda.cublas as cublas

# Build the same k x n matrix two ways and upload both to the GPU so that
# their shapes and layouts can be compared in the cells that follow.
m, k, n = 1, 4, 2

# X1: derived through reshape -> partial overwrite -> transpose -> reshape
X = np.ones((k, n), dtype=np.float32).reshape((n, 2, 2))  # k x n, viewed as (n, 2, 2)
X[0, :, 1] = np.full(2, 2, dtype=np.float32)
X = np.transpose(X, (1, 2, 0)).reshape((k, n))
X_gpu = gpuarray.to_gpu(X)

# X2: the same kind of matrix spelled out as a literal
X2 = np.array([[1., 1.], [2., 1.]] * 2, dtype=np.float32)
X2_gpu = gpuarray.to_gpu(X2)


In [None]:
# Show the shapes of the two host arrays, then of their device copies.
host_shapes = (X.shape, X2.shape)
device_shapes = (X_gpu.shape, X2_gpu.shape)
print(host_shapes)
print(device_shapes)

In [None]:
# Bare expression: the notebook displays X's memory-layout flags as the cell output.
X.flags

In [None]:
# Bare expression: the notebook displays the repr of the device array X_gpu.
X_gpu

In [None]:
# Zero the element at row 0, column 1 directly on the device, then display
# the array to confirm that the write took effect.
X_gpu[0, 1] = 0
X_gpu