Skip to content

Commit

Permalink
Merge pull request #215 from pypr/cuda-working
Browse files Browse the repository at this point in the history
Preliminary CUDA support
  • Loading branch information
prabhuramachandran committed May 27, 2019
2 parents f7f46ea + 3e23a40 commit b0fd689
Show file tree
Hide file tree
Showing 17 changed files with 776 additions and 577 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ install:
- conda config --add channels conda-forge
- conda config --add channels defaults
- conda install -c conda-forge pocl==1.2 pyopencl
- conda install -c defaults virtualenv
- conda install -c defaults virtualenv cython
- python -c 'import pyopencl as cl'
- pip install beaker tox tox-travis

Expand Down
147 changes: 3 additions & 144 deletions pysph/base/device_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,85 +17,6 @@
logger = logging.getLogger()


minmax_tpl = """//CL//
${dtype} mmc_neutral()
{
${dtype} result;
% for prop in prop_names:
% if not only_max:
result.cur_min_${prop} = INFINITY;
% endif
% if not only_min:
result.cur_max_${prop} = -INFINITY;
% endif
% endfor
return result;
}
${dtype} mmc_from_scalar(${args})
{
${dtype} result;
% for prop in prop_names:
% if not only_max:
result.cur_min_${prop} = ${prop};
% endif
% if not only_min:
result.cur_max_${prop} = ${prop};
% endif
% endfor
return result;
}
${dtype} agg_mmc(${dtype} a, ${dtype} b)
{
${dtype} result = a;
% for prop in prop_names:
% if not only_max:
if (b.cur_min_${prop} < result.cur_min_${prop})
result.cur_min_${prop} = b.cur_min_${prop};
% endif
% if not only_min:
if (b.cur_max_${prop} > result.cur_max_${prop})
result.cur_max_${prop} = b.cur_max_${prop};
% endif
% endfor
return result;
}
"""


def minmax_collector_key(device, dtype, props, name, *args):
return (device, dtype, tuple(props), name)


@memoize(key=minmax_collector_key)
def make_collector_dtype(device, dtype, props, name, only_min, only_max):
fields = [("pad", np.int32)]

for prop in props:
if not only_min:
fields.append(("cur_max_%s" % prop, dtype))
if not only_max:
fields.append(("cur_min_%s" % prop, dtype))

custom_dtype = np.dtype(fields)

from pyopencl.tools import get_or_register_dtype, match_dtype_to_c_struct

custom_dtype, c_decl = match_dtype_to_c_struct(device, name, custom_dtype)
custom_dtype = get_or_register_dtype(name, custom_dtype)

return custom_dtype, c_decl


class ExtractParticles(Template):
def __init__(self, name, prop_names):
super(ExtractParticles, self).__init__(name=name)
Expand Down Expand Up @@ -237,73 +158,11 @@ def max(self, arg):
return float(array.maximum(getattr(self, arg),
backend=self.backend))

@memoize(key=lambda *args: (args[-2], args[-1]))
def _get_minmax_kernel(self, ctx, dtype, mmc_dtype, prop_names,
only_min, only_max, name, mmc_c_decl):
tpl_args = ", ".join(
["%(dtype)s %(prop)s" % {'dtype': dtype, 'prop': prop}
for prop in prop_names]
)

mmc_preamble = mmc_c_decl + minmax_tpl
preamble = mkt.Template(text=mmc_preamble).render(
args=tpl_args, prop_names=prop_names, dtype=name,
only_min=only_min, only_max=only_max
)

knl_args = ", ".join(
["__global %(dtype)s *%(prop)s" % {'dtype': dtype, 'prop': prop}
for prop in prop_names]
)

map_args = ", ".join(
["%(prop)s[i]" % {'dtype': dtype, 'prop': prop}
for prop in prop_names]
)

from pyopencl.reduction import ReductionKernel

knl = ReductionKernel(
ctx, mmc_dtype, neutral="mmc_neutral()",
reduce_expr="agg_mmc(a, b)",
map_expr="mmc_from_scalar(%s)" % map_args,
arguments=knl_args,
preamble=preamble
)

return knl

def update_minmax_cl(self, props, only_min=False, only_max=False):
if self.backend != 'opencl':
raise ValueError('''Optimized minmax update only supported
using opencl backend''')
if only_min and only_max:
raise ValueError("Only one of only_min and only_max can be True")

dtype = 'double' if self.use_double else 'float'
op = 'min' if not only_max else ''
op += 'max' if not only_min else ''
name = "%s_collector_%s" % (op, ''.join([dtype] + props))

from compyle.opencl import get_context
ctx = get_context()
ary_list = [getattr(self, prop) for prop in props]
array.update_minmax_gpu(ary_list, only_min=only_min,
only_max=only_max, backend=self.backend)

mmc_dtype, mmc_c_decl = make_collector_dtype(ctx.devices[0],
self._dtype, props, name,
only_min, only_max)
knl = self._get_minmax_kernel(ctx, dtype, mmc_dtype, props,
only_min, only_max, name, mmc_c_decl)

args = [getattr(self, prop).dev for prop in props]

result = knl(*args).get()

for prop in props:
proparr = self._data[prop]
if not only_max:
proparr.minimum = result["cur_min_%s" % prop]
if not only_min:
proparr.maximum = result["cur_max_%s" % prop]

def update_min_max(self, props=None):
"""Update the min,max values of all properties """
Expand Down
2 changes: 0 additions & 2 deletions pysph/base/gpu_helper_functions.mako
Original file line number Diff line number Diff line change
Expand Up @@ -143,5 +143,3 @@
}
</%def>


125 changes: 125 additions & 0 deletions pysph/base/gpu_helper_kernels.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
from compyle.api import annotate, Elementwise
from compyle.parallel import Scan
from math import floor
from pytools import memoize


@memoize(key=lambda *args: tuple(args))
def get_elwise(f, backend):
return Elementwise(f, backend=backend)


@memoize(key=lambda *args: tuple(args))
def get_scan(inp_f, out_f, dtype, backend):
return Scan(input=inp_f, output=out_f, dtype=dtype,
backend=backend)


@annotate
def exclusive_input(i, ary):
return ary[i]


@annotate
def exclusive_output(i, prev_item, ary):
ary[i] = prev_item


@annotate
def norm2(x, y, z):
return x * x + y * y + z * z


@annotate
def find_cell_id(x, y, z, h, c):
c[0] = floor((x) / h)
c[1] = floor((y) / h)
c[2] = floor((z) / h)


@annotate(p='ulong', return_='ulong')
def interleave1(p):
return p


@annotate(ulong='p, q', return_='ulong')
def interleave2(p, q):
p = p & 0xffffffff
p = (p | (p << 16)) & 0x0000ffff0000ffff
p = (p | (p << 8)) & 0x00ff00ff00ff00ff
p = (p | (p << 4)) & 0x0f0f0f0f0f0f0f0f
p = (p | (p << 2)) & 0x3333333333333333
p = (p | (p << 1)) & 0x5555555555555555

q = q & 0xffffffff
q = (q | (q << 16)) & 0x0000ffff0000ffff
q = (q | (q << 8)) & 0x00ff00ff00ff00ff
q = (q | (q << 4)) & 0x0f0f0f0f0f0f0f0f
q = (q | (q << 2)) & 0x3333333333333333
q = (q | (q << 1)) & 0x5555555555555555

return (p | (q << 1))


@annotate(ulong='p, q, r', return_='ulong')
def interleave3(p, q, r):
p = (p | (p << 32)) & 0x1f00000000ffff
p = (p | (p << 16)) & 0x1f0000ff0000ff
p = (p | (p << 8)) & 0x100f00f00f00f00f
p = (p | (p << 4)) & 0x10c30c30c30c30c3
p = (p | (p << 2)) & 0x1249249249249249

q = (q | (q << 32)) & 0x1f00000000ffff
q = (q | (q << 16)) & 0x1f0000ff0000ff
q = (q | (q << 8)) & 0x100f00f00f00f00f
q = (q | (q << 4)) & 0x10c30c30c30c30c3
q = (q | (q << 2)) & 0x1249249249249249

r = (r | (r << 32)) & 0x1f00000000ffff
r = (r | (r << 16)) & 0x1f0000ff0000ff
r = (r | (r << 8)) & 0x100f00f00f00f00f
r = (r | (r << 4)) & 0x10c30c30c30c30c3
r = (r | (r << 2)) & 0x1249249249249249

return (p | (q << 1) | (r << 2))


@annotate
def find_idx(keys, num_particles, key):
first = 0
last = num_particles - 1
middle = (first + last) / 2

while first <= last:
if keys[middle] < key:
first = middle + 1
elif keys[middle] > key:
last = middle - 1
elif keys[middle] == key:
if middle == 0:
return 0
if keys[middle - 1] != key:
return middle
else:
last = middle - 1
middle = (first + last) / 2

return -1


@annotate
def neighbor_boxes(c_x, c_y, c_z, nbr_boxes):
nbr_boxes_length = 1

nbr_boxes[0] = interleave3(c_x, c_y, c_z)

key = declare('ulong')
for j in range(-1, 2):
for k in range(-1, 2):
for m in range(-1, 2):
if (j != 0 or k != 0 or m != 0) and c_x + m >= 0 and c_y + k >= 0 and c_z + j >= 0:
key = interleave3(c_x + m, c_y + k, c_z + j)
nbr_boxes[nbr_boxes_length] = key
nbr_boxes_length += 1

return nbr_boxes_length
20 changes: 11 additions & 9 deletions pysph/base/gpu_nnps_base.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ cimport cython

import pyopencl as cl
import pyopencl.array
from pyopencl.scan import ExclusiveScanKernel
from pyopencl.elementwise import ElementwiseKernel

from pysph.base.nnps_base cimport *
Expand All @@ -45,11 +44,14 @@ from utils import ParticleTAGS

from nnps_base cimport *

from pysph.base.gpu_helper_kernels import (exclusive_input, exclusive_output,
get_scan)


cdef class GPUNeighborCache:
def __init__(self, GPUNNPS nnps, int dst_index, int src_index,
backend='opencl'):
self.backend = backend
backend=None):
self.backend = get_backend(backend)
self._dst_index = dst_index
self._src_index = src_index
self._nnps = nnps
Expand Down Expand Up @@ -84,6 +86,7 @@ cdef class GPUNeighborCache:
#### Private protocol ################################################

cdef void _find_neighbors(self):

self._nnps.find_neighbor_lengths(self._nbr_lengths_gpu)
# FIXME:
# - Store sum kernel
Expand All @@ -101,11 +104,10 @@ cdef class GPUNeighborCache:

# Do prefix sum on self._neighbor_lengths for the self._start_idx
if self._get_start_indices is None:
self._get_start_indices = ExclusiveScanKernel(
get_context(), np.uint32, scan_expr="a+b", neutral="0"
)
self._get_start_indices = get_scan(exclusive_input, exclusive_output,
dtype=np.uint32, backend=self.backend)

self._get_start_indices(self._start_idx_gpu.dev)
self._get_start_indices(ary=self._start_idx_gpu)

self._nnps.find_nearest_neighbors_gpu(self._neighbors_gpu,
self._start_idx_gpu)
Expand Down Expand Up @@ -144,7 +146,7 @@ cdef class GPUNNPS(NNPSBase):
"""
def __init__(self, int dim, list particles, double radius_scale=2.0,
int ghost_layers=1, domain=None, bint cache=True,
bint sort_gids=False, backend='opencl'):
bint sort_gids=False, backend=None):
"""Constructor for NNPS
Parameters
Expand Down Expand Up @@ -178,8 +180,8 @@ cdef class GPUNNPS(NNPSBase):
domain, cache, sort_gids)

self.backend = get_backend(backend)
self.backend = 'opencl' if self.backend is 'cython' else self.backend
self.use_double = get_config().use_double
self.queue = get_queue()
self.dtype = np.float64 if self.use_double else np.float32
self.dtype_max = np.finfo(self.dtype).max
self._last_domain_size = 0.0
Expand Down
5 changes: 3 additions & 2 deletions pysph/base/octree_gpu_nnps.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,11 @@ cdef class OctreeGPUNNPS(GPUNNPS):
int ghost_layers=1, domain=None, bint fixed_h=False,
bint cache=True, bint sort_gids=False,
allow_sort=False, leaf_size=32,
bint use_elementwise=False, bint use_partitions=False):
bint use_elementwise=False, bint use_partitions=False,
backend=None):
GPUNNPS.__init__(
self, dim, particles, radius_scale, ghost_layers, domain,
cache, sort_gids, backend='opencl'
cache, sort_gids, backend=backend
)

self.src_index = -1
Expand Down

0 comments on commit b0fd689

Please sign in to comment.