opencl: finish implementation of level 1 transform

rjw57 · Sep 4, 2014 · f8da946 · f8da946
1 parent 11d67d9
commit f8da946
Show file tree

Hide file tree

Showing 3 changed files with 132 additions and 10 deletions.
diff --git a/dtcwt/opencl/q2c.cl b/dtcwt/opencl/q2c.cl
@@ -0,0 +1,75 @@
+#ifndef CHUNK_SIZE
+#   error CHUNK_SIZE must be defined
+#endif
+
+typedef float scalar_t;
+typedef float2 vec2_t;
+typedef float3 vec3_t;
+typedef float4 vec4_t;
+
+typedef float2 complex_t;
+
+__kernel
+void q2c(
+    __global vec4_t* input_ptr, int input_start, int2 input_strides, int2 input_shape,
+    __global scalar_t* low_ptr, int low_start, int2 low_strides, int2 low_shape,
+    __global complex_t* high_ptr, int high_start, int3 high_strides, int3 high_shape)
+{
+    input_ptr += input_start;
+    high_ptr += high_start;
+    low_ptr += low_start;
+
+    // Abort if we will read from invalid input
+    int2 input_coord = 2 * (int2)(get_global_id(0), get_global_id(1));
+    if(any(input_coord < 0) || any(input_coord >= input_shape-1)) {
+        return;
+    }
+
+    // Each output pixel corresponds to a 2x2 region of the input. Read each portion.
+    int2 input_offsets = mul24(input_coord, input_strides);
+    int input_offset = input_offsets.x + input_offsets.y;
+    vec4_t tl = input_ptr[input_offset];
+    vec4_t tr = input_ptr[input_offset + input_strides.y];
+    vec4_t bl = input_ptr[input_offset + input_strides.x];
+    vec4_t br = input_ptr[input_offset + input_strides.x + input_strides.y];
+
+    // Write lowpass output
+    int2 low_coord = 2*(int2)(get_global_id(0), get_global_id(1));
+    if(!any(low_coord < 0) && !any(low_coord >= low_shape-1)) {
+        int2 low_offsets = mul24(low_coord, low_strides);
+        int low_offset = low_offsets.x + low_offsets.y;
+        low_ptr[low_offset] = tl.x;
+        low_ptr[low_offset + low_strides.y] = tr.x;
+        low_ptr[low_offset + low_strides.x] = bl.x;
+        low_ptr[low_offset + low_strides.x + low_strides.y] = br.x;
+    }
+
+    // Write highpass output
+    int2 high_coord = (int2)(get_global_id(0), get_global_id(1));
+    if(!any(high_coord < 0) && !any(high_coord >= high_shape.xy)) {
+        int2 high_offsets = mul24(high_coord.xy, high_strides.xy);
+        int high_offset = high_offsets.x + high_offsets.y;
+
+        // p = (tl + j*tr) / sqrt(2)
+        // q = (br - j*bl) / sqrt(2)
+        // z1 = p - q, z2 = p + q
+        //
+        // => z1 = ((tl-br) + j*(tr+bl)) / sqrt2
+        // => z2 = ((tl+br) + j*(tr-bl)) / sqrt2
+
+        float sqrt_half = sqrt(0.5);
+        vec3_t z1_real = (tl.yzw-br.yzw) * sqrt_half;
+        vec3_t z1_imag = (tr.yzw+bl.yzw) * sqrt_half;
+        vec3_t z2_real = (tl.yzw+br.yzw) * sqrt_half;
+        vec3_t z2_imag = (tr.yzw-bl.yzw) * sqrt_half;
+
+        high_ptr[high_offset + 0*high_strides.z] = (complex_t)(z1_real.y, z1_imag.y);
+        high_ptr[high_offset + 1*high_strides.z] = (complex_t)(z1_real.z, z1_imag.z);
+        high_ptr[high_offset + 2*high_strides.z] = (complex_t)(z1_real.x, z1_imag.x);
+        high_ptr[high_offset + 3*high_strides.z] = (complex_t)(z2_real.x, z2_imag.x);
+        high_ptr[high_offset + 4*high_strides.z] = (complex_t)(z2_real.z, z2_imag.z);
+        high_ptr[high_offset + 5*high_strides.z] = (complex_t)(z2_real.y, z2_imag.y);
+    }
+}
+
+// vim:sw=4:sts=4:et
diff --git a/dtcwt/opencl/transform2d.py b/dtcwt/opencl/transform2d.py
@@ -4,6 +4,8 @@
 
 """
 import logging
+import os
+
 import numpy as np
 
 try:
@@ -15,6 +17,7 @@
 
 from dtcwt.numpy.common import Pyramid as NumPyPyramid
 from dtcwt.numpy.transform2d import DEFAULT_BIORT, DEFAULT_QSHIFT
+from dtcwt.opencl.util import array_to_spec, global_and_local_size, good_chunk_size_for_queue
 
 import dtcwt.opencl.convolve as convolve
 import dtcwt.opencl.coeffs as coeffs
@@ -88,6 +91,36 @@ def lowpass(self):
     def highpasses(self):
         return tuple(x.get() for x in self.cl_highpasses)
 
+class _Q2C(object):
+    PROGRAM_PATH = os.path.join(os.path.dirname(__file__), 'q2c.cl')
+    def __init__(self, queue):
+        self.queue = queue
+        self.chunk_size = good_chunk_size_for_queue(queue)
+        self.program = self._build_program()
+        self.q2c = self.program.q2c
+
+    def _build_program(self):
+        constants = { 'CHUNK_SIZE': self.chunk_size, }
+        options = list('-D{0}={1}'.format(k,v) for k,v in constants.items())
+        program = cl.Program(self.queue.context, open(_Q2C.PROGRAM_PATH).read())
+        program.build(options)
+        return program
+
+    def __call__(self, input_array, low_array, high_array, wait_for=None):
+        assert len(low_array.shape) == 2
+        assert len(high_array.shape) == 3 and high_array.shape[2] >= 6
+
+        in_data, in_offset, in_strides, in_shape = array_to_spec(input_array)
+        low_data, low_offset, low_strides, low_shape = array_to_spec(low_array)
+        high_data, high_offset, high_strides, high_shape = array_to_spec(high_array)
+        global_size, local_size = global_and_local_size(high_array.shape, self.chunk_size)
+
+        return self.q2c(self.queue, global_size, local_size,
+            in_data, in_offset, in_strides, in_shape,
+            low_data, low_offset, low_strides, low_shape,
+            high_data, high_offset, high_strides, high_shape,
+            wait_for=wait_for)
+
 class Transform2d(object):
     """OpenCL implementation of 2D DTCWT.
 
@@ -104,8 +137,10 @@ def __init__(self, biort=DEFAULT_BIORT, qshift=DEFAULT_QSHIFT, queue=None):
         self._queue = queue
         self._biort_coeffs = coeffs.biort(biort)
         self._input_dtype = np.float32
-        self._l1_convolution = convolve.Convolution1D(
-            self._queue, self._biort_coeffs, self._input_dtype)
+        self._l1_convolution = convolve.Convolution2D(self._queue, self._biort_coeffs)
+
+        # Prepare the q2c kernel
+        self._q2c = _Q2C(self._queue)
 
     def forward(self, X, nlevels=3, include_scale=False, wait_for=None):
         if include_scale:
@@ -147,10 +182,22 @@ def _level1_transform(self, X, wait_for):
         if np.any(np.asarray(X.shape[:2]) % 2 == 1):
             raise ValueError('Input must have even rows and columns')
 
-        # Do the first dimension convolution
-        lohi = cla.empty(self._queue, X.shape, dtype=cla.vec.float2)
-        evt = self._l1_convolution(X, lohi, wait_for=wait_for)
+        # Allocate workspace for convolution
+        ws_size = self._l1_convolution.workspace_size_for_input(X)
+        ws = cl.Buffer(self._queue.context, cl.mem_flags.READ_WRITE, ws_size)
+
+        # Create convolution output
+        output_array = cla.empty(self._queue, X.shape, self._l1_convolution.output_dtype)
+
+        # Do the convolution
+        evt = self._l1_convolution(X, output_array, ws)
+
+        # Create low and highpass output
+        half_shape = tuple(x>>1 for x in X.shape)
+        lowpass = cla.empty(self._queue, X.shape, np.float32)
+        highpass = cla.empty(self._queue, half_shape + (6,), np.complex64)
+
+        # Extract low- and highpass from convolution output
+        evt = self._q2c(output_array, lowpass, highpass, wait_for=[evt])
 
-        lp = cla.empty(self._queue, X.shape, dtype=np.float32)
-        hp = cla.empty(self._queue, X.shape + (6,), dtype=np.complex64)
-        return lp, hp, evt
+        return lowpass, highpass, evt
diff --git a/tests/testopenclxfm2.py b/tests/testopenclxfm2.py
@@ -46,5 +46,5 @@ def test_level1_forward_transform():
         # Perform opencl level 1 transform
         ocl = t.forward(plane_device, nlevels=1)
 
-        # TODO: Compare result
-        # assert_pyramids_almost_equal(gold, ocl)
+        # Compare result
+        assert_pyramids_almost_equal(gold, ocl)