In [1]:
%reset -f
import numpy as np
import pycuda.autoinit
import pycuda.driver as drv
import pycuda.gpuarray as gpuarray
from pycuda.sparse.packeted import PacketedSpMV
from pycuda.tools import DeviceMemoryPool
from scipy.sparse import csr_matrix



# Make sure we can detect GPUArray instances

In [2]:
# Copy a small host array to the device, then print the three ways the
# resulting object reports its type: type(), __class__, and the bare class name.
x = np.arange(5)
d_x = gpuarray.to_gpu(x)
print(type(d_x), d_x.__class__, d_x.__class__.__name__, sep='\n')

<class 'pycuda.gpuarray.GPUArray'>
<class 'pycuda.gpuarray.GPUArray'>
GPUArray


# Make sure PacketedSpMV stores the CSR matrix in GPU arrays

## Initialize arrays

In [3]:
# Element type shared by every host array built in this cell.
dtype = np.float32

# 7x7 test matrix with nine non-zero entries; row 3 is all zeros.
a_dense = np.array(
    [
        [0, 0, 0, 1, 0, 0, 0],
        [0, 2, 0, 0, 0, 0, 0],
        [0, 0, 4, 0, 0, 0, 3],
        [0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 5, 0, 0, 0],
        [1, 0, 0, 2, 4, 0, 0],
        [0, 3, 0, 0, 0, 0, 0],
    ],
    dtype=dtype,
)
a_sparse = csr_matrix(a_dense)

# Raw CSR components: stored values, row pointers, column indices.
a_data, a_row_ptrs, a_col_idxs = (
    a_sparse.data,
    a_sparse.indptr,
    a_sparse.indices,
)

# Right-hand side and the expected product, both as (7, 1) column vectors.
b = np.transpose(np.array([[1, 2, 3, 4, 5, 6, 7]], dtype=dtype))
c_correct = np.transpose(np.array([[4, 4, 33, 0, 20, 29, 6]], dtype=dtype))

# Dimensions of the product: (m x k) times (k x n).
m, k = a_sparse.shape
n = 1

## Create PacketedSpMV instance

In [4]:
# Build the packeted SpMV operator from the test matrix, declared non-symmetric.
spmv = PacketedSpMV(
    a_sparse,
    is_symmetric=False,
    dtype=dtype,
)

In [5]:
# List every attribute of the operator next to the name of its type.
for attr in dir(spmv):
    type_name = type(getattr(spmv, attr)).__name__
    print('{} ({})'.format(attr, type_name))

__call__ (method)
__class__ (type)
__delattr__ (method-wrapper)
__dict__ (dict)
__dir__ (builtin_function_or_method)
__doc__ (NoneType)
__eq__ (method-wrapper)
__format__ (builtin_function_or_method)
__ge__ (method-wrapper)
__getattribute__ (method-wrapper)
__gt__ (method-wrapper)
__hash__ (method-wrapper)
__init__ (method)
__le__ (method-wrapper)
__lt__ (method-wrapper)
__module__ (str)
__ne__ (method-wrapper)
__new__ (builtin_function_or_method)
__reduce__ (builtin_function_or_method)
__reduce_ex__ (builtin_function_or_method)
__repr__ (method-wrapper)
__setattr__ (method-wrapper)
__sizeof__ (builtin_function_or_method)
__str__ (method-wrapper)
__subclasshook__ (builtin_function_or_method)
__weakref__ (NoneType)
block_count (int)
build_gpu_data_structure (method)
data_array (GPUArray)
dtype (dtype)
find_local_row_costs_and_remaining_coo (method)
find_simple_index_stuff (method)
find_thread_assignment (method)
get_kernel (method)
index_array (GPUArray)
index_dtype (type)
new2old_fetch

In [6]:
# Show only the attributes whose type name ends in 'GPUArray', with contents.
for attr in dir(spmv):
    val = getattr(spmv, attr)
    type_name = type(val).__name__
    if not type_name.endswith('GPUArray'):
        continue
    print('{} ({}) = {}'.format(attr, type_name, val))

data_array (GPUArray) = [ 1.  4.  3. ...,  0.  0.  0.]
index_array (GPUArray) = [327680 131074 393217 ...,      0      0      0]
new2old_fetch_indices (GPUArray) = [0 1 2 3 4 5 6]
old2new_fetch_indices (GPUArray) = [0 1 2 3 4 5 6]
packet_base_rows (GPUArray) = [0 7]
thread_ends (GPUArray) = [3072 2049 1026 ..., 1021 1022 1023]
thread_starts (GPUArray) = [   0    1    2 ..., 1021 1022 1023]


# Make sure we can correctly calculate the product

## Move b to the GPU and allocate c there

In [7]:
# Both device vectors draw their memory from the same pool allocator.
dev_pool = DeviceMemoryPool()

# Upload the right-hand side ...
d_b = gpuarray.to_gpu(b, allocator=dev_pool.allocate)
# ... and create a zero-filled length-m output vector with d_b's allocator.
d_c = gpuarray.zeros(
    m,
    dtype=dtype,
    allocator=d_b.allocator,
)

## Calculate the matrix product

In [8]:
# Apply the operator to d_b, handing it d_c as the second argument, and
# rebind d_c to whatever the call returns.
d_c = spmv(d_b, d_c)

# Pull the result back to the host and print it beside the expected product.
c_computed = d_c.get().flatten()
c_expected = c_correct.flatten()
print('C computed: {}'.format(c_computed))
print('C correct: {}'.format(c_expected))

C computed: [  4.   4.  33.   0.  20.  29.   6.]
C correct: [  4.   4.  33.   0.  20.  29.   6.]


# Run benchmarks

In [None]:
%reset -f
import numpy as np
import pycuda.autoinit
import pycuda.driver as drv
import pycuda.gpuarray as gpuarray
from pycuda.sparse.packeted import PacketedSpMV
from pycuda.tools import DeviceMemoryPool
from scipy.sparse import csr_matrix
from time import time


def spmv_cuda(a_sparse, b, count):
    """Evaluate ``a_sparse * b`` on the GPU ``count`` times and time the loop.

    Parameters
    ----------
    a_sparse : sparse matrix
        Handed to ``PacketedSpMV``; its ``dtype`` and ``shape[0]`` determine
        the element type and length of the output vector.
    b : numpy.ndarray
        Host right-hand side; copied to the device before timing starts.
    count : int
        Number of times the operator is applied inside the timed region.

    Returns
    -------
    tuple
        ``(c, elapsed)`` where ``c`` is the host copy of the result vector
        and ``elapsed`` is the wall-clock time in seconds spent applying the
        operator ``count`` times. Operator construction, the upload of ``b``
        and the download of ``c`` are outside the timed region.
    """
    dtype = a_sparse.dtype
    m = a_sparse.shape[0]

    print('moving objects to GPU...')

    spmv = PacketedSpMV(a_sparse, is_symmetric=False, dtype=dtype)

    dev_pool = DeviceMemoryPool()
    d_b = gpuarray.to_gpu(b, dev_pool.allocate)
    d_c = gpuarray.zeros(m, dtype=dtype, allocator=d_b.allocator)

    # NOTE(review): the operator exposes new2old/old2new fetch index arrays.
    # If they are not the identity for a given matrix, b and the result may
    # need to be permuted/unpermuted -- confirm against the PacketedSpMV docs.

    print('Executing spmv operation...')

    # Wait for any outstanding setup work so it is not charged to the loop.
    drv.Context.synchronize()
    tic = time()
    for _ in range(count):
        d_c = spmv(d_b, d_c)
    # Wait for the GPU to finish before reading the clock; without this the
    # interval can end while work is still in flight (the .get() below, which
    # does wait for the data, is outside the timed region).
    drv.Context.synchronize()
    toc = time()

    return d_c.get(), toc - tic

# Smoke test: run spmv_cuda once on a small matrix whose product is known.
dtype_test = np.float32

a_dense_test = np.array([
    [0, 0, 0, 1, 0, 0, 0],
    [0, 2, 0, 0, 0, 0, 0],
    [0, 0, 4, 0, 0, 0, 3],
    [0, 0, 0, 0, 0, 0, 0],
    [0, 0, 0, 5, 0, 0, 0],
    [1, 0, 0, 2, 4, 0, 0],
    [0, 3, 0, 0, 0, 0, 0],
], dtype=dtype_test)
a_sparse_test = csr_matrix(a_dense_test)

b_test = np.array([[1, 2, 3, 4, 5, 6, 7]], dtype=dtype_test).T
c_correct_test = np.array([[4, 4, 33, 0, 20, 29, 6]], dtype=dtype_test).T

# spmv_cuda returns (result, elapsed seconds); unpack so that c_test is the
# result vector rather than the whole tuple.
c_test, t_test = spmv_cuda(a_sparse_test, b_test, 1)

print('C computed: {}'.format(c_test.flatten()))
print('C correct: {}'.format(c_correct_test.flatten()))

# Fail loudly if the GPU product does not match the expected values.
np.testing.assert_allclose(c_test.flatten(), c_correct_test.flatten())

In [None]:
# Benchmark configuration.
COUNT = 1           # timed repetitions per backend
N = 5000            # the matrix is N x N
P = 0.01            # an entry is kept when a uniform [0, 1) draw is below P
DTYPE = np.float32

print('Constructing objects...\n\n')
np.random.seed(0)
a_dense = np.random.rand(N, N).astype(DTYPE)
# Zero every entry whose independent uniform draw is >= P.
a_dense[np.random.rand(N, N) >= P] = 0
a_sparse = csr_matrix(a_dense)

b = np.random.rand(N, 1).astype(DTYPE)

# scipy sparse
print('Testing scipy sparse matrix multiplication...\n')
tic = time()
for ii in range(COUNT):
    # Only the product is timed; no copy of b is made inside the loop.
    c_scipy = a_sparse.dot(b)
toc = time()

print('c = {}'.format(c_scipy.ravel()[:5]))
print('scipy sparse matrix multiplication took {} seconds\n\n'.format(toc - tic))

# pycuda sparse
print('Testing pycuda sparse matrix multiplication...\n')
c, t = spmv_cuda(a_sparse, b, COUNT)
# spmv_cuda allocates its output with a 1-D shape, so index the flattened
# vector instead of using a 2-D [:5, 0] slice.
print('c = {}'.format(c.ravel()[:5]))
print('pycuda sparse matrix multiplication took {} seconds\n\n'.format(t))

# Report how closely the two backends agree.
print('max |scipy - pycuda| = {}'.format(np.abs(c_scipy.ravel() - c.ravel()).max()))