Merge pull request #215 from pypr/cuda-working

Preliminary CUDA support
pypr · May 27, 2019 · b0fd689 · b0fd689
2 parents f7f46ea + 3e23a40
commit b0fd689
Show file tree

Hide file tree

Showing 17 changed files with 776 additions and 577 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -25,7 +25,7 @@ install:
     - conda config --add channels conda-forge
     - conda config --add channels defaults
     - conda install -c conda-forge pocl==1.2 pyopencl
-    - conda install -c defaults virtualenv
+    - conda install -c defaults virtualenv cython
     - python -c 'import pyopencl as cl'
     - pip install beaker tox tox-travis
 

diff --git a/pysph/base/device_helper.py b/pysph/base/device_helper.py
@@ -17,85 +17,6 @@
 logger = logging.getLogger()
 
 
-minmax_tpl = """//CL//
-
-    ${dtype} mmc_neutral()
-    {
-        ${dtype} result;
-
-        % for prop in prop_names:
-        % if not only_max:
-        result.cur_min_${prop} = INFINITY;
-        % endif
-        % if not only_min:
-        result.cur_max_${prop} = -INFINITY;
-        % endif
-        % endfor
-
-        return result;
-    }
-
-    ${dtype} mmc_from_scalar(${args})
-    {
-        ${dtype} result;
-
-        % for prop in prop_names:
-        % if not only_max:
-        result.cur_min_${prop} = ${prop};
-        % endif
-        % if not only_min:
-        result.cur_max_${prop} = ${prop};
-        % endif
-        % endfor
-
-        return result;
-    }
-
-    ${dtype} agg_mmc(${dtype} a, ${dtype} b)
-    {
-        ${dtype} result = a;
-
-        % for prop in prop_names:
-        % if not only_max:
-        if (b.cur_min_${prop} < result.cur_min_${prop})
-            result.cur_min_${prop} = b.cur_min_${prop};
-        % endif
-        % if not only_min:
-        if (b.cur_max_${prop} > result.cur_max_${prop})
-            result.cur_max_${prop} = b.cur_max_${prop};
-        % endif
-        % endfor
-
-        return result;
-    }
-
-    """
-
-
-def minmax_collector_key(device, dtype, props, name, *args):
-    return (device, dtype, tuple(props), name)
-
-
-@memoize(key=minmax_collector_key)
-def make_collector_dtype(device, dtype, props, name, only_min, only_max):
-    fields = [("pad", np.int32)]
-
-    for prop in props:
-        if not only_min:
-            fields.append(("cur_max_%s" % prop, dtype))
-        if not only_max:
-            fields.append(("cur_min_%s" % prop, dtype))
-
-    custom_dtype = np.dtype(fields)
-
-    from pyopencl.tools import get_or_register_dtype, match_dtype_to_c_struct
-
-    custom_dtype, c_decl = match_dtype_to_c_struct(device, name, custom_dtype)
-    custom_dtype = get_or_register_dtype(name, custom_dtype)
-
-    return custom_dtype, c_decl
-
-
 class ExtractParticles(Template):
     def __init__(self, name, prop_names):
         super(ExtractParticles, self).__init__(name=name)
@@ -237,73 +158,11 @@ def max(self, arg):
         return float(array.maximum(getattr(self, arg),
                                    backend=self.backend))
 
-    @memoize(key=lambda *args: (args[-2], args[-1]))
-    def _get_minmax_kernel(self, ctx, dtype, mmc_dtype, prop_names,
-                           only_min, only_max, name, mmc_c_decl):
-        tpl_args = ", ".join(
-            ["%(dtype)s %(prop)s" % {'dtype': dtype, 'prop': prop}
-                for prop in prop_names]
-        )
-
-        mmc_preamble = mmc_c_decl + minmax_tpl
-        preamble = mkt.Template(text=mmc_preamble).render(
-            args=tpl_args, prop_names=prop_names, dtype=name,
-            only_min=only_min, only_max=only_max
-        )
-
-        knl_args = ", ".join(
-            ["__global %(dtype)s *%(prop)s" % {'dtype': dtype, 'prop': prop}
-                for prop in prop_names]
-        )
-
-        map_args = ", ".join(
-            ["%(prop)s[i]" % {'dtype': dtype, 'prop': prop}
-                for prop in prop_names]
-        )
-
-        from pyopencl.reduction import ReductionKernel
-
-        knl = ReductionKernel(
-            ctx, mmc_dtype, neutral="mmc_neutral()",
-            reduce_expr="agg_mmc(a, b)",
-            map_expr="mmc_from_scalar(%s)" % map_args,
-            arguments=knl_args,
-            preamble=preamble
-        )
-
-        return knl
-
     def update_minmax_cl(self, props, only_min=False, only_max=False):
-        if self.backend != 'opencl':
-            raise ValueError('''Optimized minmax update only supported
-                             using opencl backend''')
-        if only_min and only_max:
-            raise ValueError("Only one of only_min and only_max can be True")
-
-        dtype = 'double' if self.use_double else 'float'
-        op = 'min' if not only_max else ''
-        op += 'max' if not only_min else ''
-        name = "%s_collector_%s" % (op, ''.join([dtype] + props))
-
-        from compyle.opencl import get_context
-        ctx = get_context()
+        ary_list = [getattr(self, prop) for prop in props]
+        array.update_minmax_gpu(ary_list, only_min=only_min,
+                               only_max=only_max, backend=self.backend)
 
-        mmc_dtype, mmc_c_decl = make_collector_dtype(ctx.devices[0],
-                                                     self._dtype, props, name,
-                                                     only_min, only_max)
-        knl = self._get_minmax_kernel(ctx, dtype, mmc_dtype, props,
-                                      only_min, only_max, name, mmc_c_decl)
-
-        args = [getattr(self, prop).dev for prop in props]
-
-        result = knl(*args).get()
-
-        for prop in props:
-            proparr = self._data[prop]
-            if not only_max:
-                proparr.minimum = result["cur_min_%s" % prop]
-            if not only_min:
-                proparr.maximum = result["cur_max_%s" % prop]
 
     def update_min_max(self, props=None):
         """Update the min,max values of all properties """

diff --git a/pysph/base/gpu_helper_functions.mako b/pysph/base/gpu_helper_functions.mako
@@ -143,5 +143,3 @@
     }
 
 </%def>
-
-
diff --git a/pysph/base/gpu_helper_kernels.py b/pysph/base/gpu_helper_kernels.py
@@ -0,0 +1,125 @@
+from compyle.api import annotate, Elementwise
+from compyle.parallel import Scan
+from math import floor
+from pytools import memoize
+
+
+@memoize(key=lambda *args: tuple(args))
+def get_elwise(f, backend):
+    return Elementwise(f, backend=backend)
+
+
+@memoize(key=lambda *args: tuple(args))
+def get_scan(inp_f, out_f, dtype, backend):
+    return Scan(input=inp_f, output=out_f, dtype=dtype,
+                backend=backend)
+
+
+@annotate
+def exclusive_input(i, ary):
+    return ary[i]
+
+
+@annotate
+def exclusive_output(i, prev_item, ary):
+    ary[i] = prev_item
+
+
+@annotate
+def norm2(x, y, z):
+    return x * x + y * y + z * z
+
+
+@annotate
+def find_cell_id(x, y, z, h, c):
+    c[0] = floor((x) / h)
+    c[1] = floor((y) / h)
+    c[2] = floor((z) / h)
+
+
+@annotate(p='ulong', return_='ulong')
+def interleave1(p):
+    return p
+
+
+@annotate(ulong='p, q', return_='ulong')
+def interleave2(p, q):
+    p = p & 0xffffffff
+    p = (p | (p << 16)) & 0x0000ffff0000ffff
+    p = (p | (p << 8)) & 0x00ff00ff00ff00ff
+    p = (p | (p << 4)) & 0x0f0f0f0f0f0f0f0f
+    p = (p | (p << 2)) & 0x3333333333333333
+    p = (p | (p << 1)) & 0x5555555555555555
+
+    q = q & 0xffffffff
+    q = (q | (q << 16)) & 0x0000ffff0000ffff
+    q = (q | (q << 8)) & 0x00ff00ff00ff00ff
+    q = (q | (q << 4)) & 0x0f0f0f0f0f0f0f0f
+    q = (q | (q << 2)) & 0x3333333333333333
+    q = (q | (q << 1)) & 0x5555555555555555
+
+    return (p | (q << 1))
+
+
+@annotate(ulong='p, q, r', return_='ulong')
+def interleave3(p, q, r):
+    p = (p | (p << 32)) & 0x1f00000000ffff
+    p = (p | (p << 16)) & 0x1f0000ff0000ff
+    p = (p | (p << 8)) & 0x100f00f00f00f00f
+    p = (p | (p << 4)) & 0x10c30c30c30c30c3
+    p = (p | (p << 2)) & 0x1249249249249249
+
+    q = (q | (q << 32)) & 0x1f00000000ffff
+    q = (q | (q << 16)) & 0x1f0000ff0000ff
+    q = (q | (q << 8)) & 0x100f00f00f00f00f
+    q = (q | (q << 4)) & 0x10c30c30c30c30c3
+    q = (q | (q << 2)) & 0x1249249249249249
+
+    r = (r | (r << 32)) & 0x1f00000000ffff
+    r = (r | (r << 16)) & 0x1f0000ff0000ff
+    r = (r | (r << 8)) & 0x100f00f00f00f00f
+    r = (r | (r << 4)) & 0x10c30c30c30c30c3
+    r = (r | (r << 2)) & 0x1249249249249249
+
+    return (p | (q << 1) | (r << 2))
+
+
+@annotate
+def find_idx(keys, num_particles, key):
+    first = 0
+    last = num_particles - 1
+    middle = (first + last) / 2
+
+    while first <= last:
+        if keys[middle] < key:
+            first = middle + 1
+        elif keys[middle] > key:
+            last = middle - 1
+        elif keys[middle] == key:
+            if middle == 0:
+                return 0
+            if keys[middle - 1] != key:
+                return middle
+            else:
+                last = middle - 1
+        middle = (first + last) / 2
+
+    return -1
+
+
+@annotate
+def neighbor_boxes(c_x, c_y, c_z, nbr_boxes):
+    nbr_boxes_length = 1
+
+    nbr_boxes[0] = interleave3(c_x, c_y, c_z)
+
+    key = declare('ulong')
+    for j in range(-1, 2):
+        for k in range(-1, 2):
+            for m in range(-1, 2):
+                if (j != 0 or k != 0 or m != 0) and c_x + m >= 0 and c_y + k >= 0 and c_z + j >= 0:
+                    key = interleave3(c_x + m, c_y + k, c_z + j)
+                    nbr_boxes[nbr_boxes_length] = key
+                    nbr_boxes_length += 1
+
+    return nbr_boxes_length
diff --git a/pysph/base/gpu_nnps_base.pyx b/pysph/base/gpu_nnps_base.pyx
@@ -26,7 +26,6 @@ cimport cython
 
 import pyopencl as cl
 import pyopencl.array
-from pyopencl.scan import ExclusiveScanKernel
 from pyopencl.elementwise import ElementwiseKernel
 
 from pysph.base.nnps_base cimport *
@@ -45,11 +44,14 @@ from utils import ParticleTAGS
 
 from nnps_base cimport *
 
+from pysph.base.gpu_helper_kernels import (exclusive_input, exclusive_output,
+                                           get_scan)
+
 
 cdef class GPUNeighborCache:
     def __init__(self, GPUNNPS nnps, int dst_index, int src_index,
-            backend='opencl'):
-        self.backend = backend
+            backend=None):
+        self.backend = get_backend(backend)
         self._dst_index = dst_index
         self._src_index = src_index
         self._nnps = nnps
@@ -84,6 +86,7 @@ cdef class GPUNeighborCache:
     #### Private protocol ################################################
 
     cdef void _find_neighbors(self):
+
         self._nnps.find_neighbor_lengths(self._nbr_lengths_gpu)
         # FIXME:
         # - Store sum kernel
@@ -101,11 +104,10 @@ cdef class GPUNeighborCache:
 
         # Do prefix sum on self._neighbor_lengths for the self._start_idx
         if self._get_start_indices is None:
-            self._get_start_indices = ExclusiveScanKernel(
-                get_context(), np.uint32, scan_expr="a+b", neutral="0"
-            )
+            self._get_start_indices = get_scan(exclusive_input, exclusive_output,
+                    dtype=np.uint32, backend=self.backend)
 
-        self._get_start_indices(self._start_idx_gpu.dev)
+        self._get_start_indices(ary=self._start_idx_gpu)
 
         self._nnps.find_nearest_neighbors_gpu(self._neighbors_gpu,
                 self._start_idx_gpu)
@@ -144,7 +146,7 @@ cdef class GPUNNPS(NNPSBase):
     """
     def __init__(self, int dim, list particles, double radius_scale=2.0,
                  int ghost_layers=1, domain=None, bint cache=True,
-                 bint sort_gids=False, backend='opencl'):
+                 bint sort_gids=False, backend=None):
         """Constructor for NNPS
 
         Parameters
@@ -178,8 +180,8 @@ cdef class GPUNNPS(NNPSBase):
                 domain, cache, sort_gids)
 
         self.backend = get_backend(backend)
+        self.backend = 'opencl' if self.backend is 'cython' else self.backend
         self.use_double = get_config().use_double
-        self.queue = get_queue()
         self.dtype = np.float64 if self.use_double else np.float32
         self.dtype_max = np.finfo(self.dtype).max
         self._last_domain_size = 0.0

diff --git a/pysph/base/octree_gpu_nnps.pyx b/pysph/base/octree_gpu_nnps.pyx
@@ -8,10 +8,11 @@ cdef class OctreeGPUNNPS(GPUNNPS):
                  int ghost_layers=1, domain=None, bint fixed_h=False,
                  bint cache=True, bint sort_gids=False,
                  allow_sort=False, leaf_size=32,
-                 bint use_elementwise=False, bint use_partitions=False):
+                 bint use_elementwise=False, bint use_partitions=False,
+                 backend=None):
         GPUNNPS.__init__(
             self, dim, particles, radius_scale, ghost_layers, domain,
-            cache, sort_gids, backend='opencl'
+            cache, sort_gids, backend=backend
         )
 
         self.src_index = -1