Skip to content

Commit

Permalink
opencl: finish implementation of level 1 transform
Browse files Browse the repository at this point in the history
  • Loading branch information
rjw57 committed Sep 4, 2014
1 parent 11d67d9 commit f8da946
Show file tree
Hide file tree
Showing 3 changed files with 132 additions and 10 deletions.
75 changes: 75 additions & 0 deletions dtcwt/opencl/q2c.cl
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#ifndef CHUNK_SIZE
# error CHUNK_SIZE must be defined
#endif

typedef float scalar_t;
typedef float2 vec2_t;
typedef float3 vec3_t;
typedef float4 vec4_t;

typedef float2 complex_t;

__kernel
void q2c(
__global vec4_t* input_ptr, int input_start, int2 input_strides, int2 input_shape,
__global scalar_t* low_ptr, int low_start, int2 low_strides, int2 low_shape,
__global complex_t* high_ptr, int high_start, int3 high_strides, int3 high_shape)
{
input_ptr += input_start;
high_ptr += high_start;
low_ptr += low_start;

// Abort if we will read from invalid input
int2 input_coord = 2 * (int2)(get_global_id(0), get_global_id(1));
if(any(input_coord < 0) || any(input_coord >= input_shape-1)) {
return;
}

// Each output pixel corresponds to a 2x2 region of the input. Read each portion.
int2 input_offsets = mul24(input_coord, input_strides);
int input_offset = input_offsets.x + input_offsets.y;
vec4_t tl = input_ptr[input_offset];
vec4_t tr = input_ptr[input_offset + input_strides.y];
vec4_t bl = input_ptr[input_offset + input_strides.x];
vec4_t br = input_ptr[input_offset + input_strides.x + input_strides.y];

// Write lowpass output
int2 low_coord = 2*(int2)(get_global_id(0), get_global_id(1));
if(!any(low_coord < 0) && !any(low_coord >= low_shape-1)) {
int2 low_offsets = mul24(low_coord, low_strides);
int low_offset = low_offsets.x + low_offsets.y;
low_ptr[low_offset] = tl.x;
low_ptr[low_offset + low_strides.y] = tr.x;
low_ptr[low_offset + low_strides.x] = bl.x;
low_ptr[low_offset + low_strides.x + low_strides.y] = br.x;
}

// Write highpass output
int2 high_coord = (int2)(get_global_id(0), get_global_id(1));
if(!any(high_coord < 0) && !any(high_coord >= high_shape.xy)) {
int2 high_offsets = mul24(high_coord.xy, high_strides.xy);
int high_offset = high_offsets.x + high_offsets.y;

// p = (tl + j*tr) / sqrt(2)
// q = (br - j*bl) / sqrt(2)
// z1 = p - q, z2 = p + q
//
// => z1 = ((tl-br) + j*(tr+bl)) / sqrt2
// => z2 = ((tl+br) + j*(tr-bl)) / sqrt2

float sqrt_half = sqrt(0.5);
vec3_t z1_real = (tl.yzw-br.yzw) * sqrt_half;
vec3_t z1_imag = (tr.yzw+bl.yzw) * sqrt_half;
vec3_t z2_real = (tl.yzw+br.yzw) * sqrt_half;
vec3_t z2_imag = (tr.yzw-bl.yzw) * sqrt_half;

high_ptr[high_offset + 0*high_strides.z] = (complex_t)(z1_real.y, z1_imag.y);
high_ptr[high_offset + 1*high_strides.z] = (complex_t)(z1_real.z, z1_imag.z);
high_ptr[high_offset + 2*high_strides.z] = (complex_t)(z1_real.x, z1_imag.x);
high_ptr[high_offset + 3*high_strides.z] = (complex_t)(z2_real.x, z2_imag.x);
high_ptr[high_offset + 4*high_strides.z] = (complex_t)(z2_real.z, z2_imag.z);
high_ptr[high_offset + 5*high_strides.z] = (complex_t)(z2_real.y, z2_imag.y);
}
}

// vim:sw=4:sts=4:et
63 changes: 55 additions & 8 deletions dtcwt/opencl/transform2d.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
"""
import logging
import os

import numpy as np

try:
Expand All @@ -15,6 +17,7 @@

from dtcwt.numpy.common import Pyramid as NumPyPyramid
from dtcwt.numpy.transform2d import DEFAULT_BIORT, DEFAULT_QSHIFT
from dtcwt.opencl.util import array_to_spec, global_and_local_size, good_chunk_size_for_queue

import dtcwt.opencl.convolve as convolve
import dtcwt.opencl.coeffs as coeffs
Expand Down Expand Up @@ -88,6 +91,36 @@ def lowpass(self):
def highpasses(self):
return tuple(x.get() for x in self.cl_highpasses)

class _Q2C(object):
PROGRAM_PATH = os.path.join(os.path.dirname(__file__), 'q2c.cl')
def __init__(self, queue):
self.queue = queue
self.chunk_size = good_chunk_size_for_queue(queue)
self.program = self._build_program()
self.q2c = self.program.q2c

def _build_program(self):
constants = { 'CHUNK_SIZE': self.chunk_size, }
options = list('-D{0}={1}'.format(k,v) for k,v in constants.items())
program = cl.Program(self.queue.context, open(_Q2C.PROGRAM_PATH).read())
program.build(options)
return program

def __call__(self, input_array, low_array, high_array, wait_for=None):
assert len(low_array.shape) == 2
assert len(high_array.shape) == 3 and high_array.shape[2] >= 6

in_data, in_offset, in_strides, in_shape = array_to_spec(input_array)
low_data, low_offset, low_strides, low_shape = array_to_spec(low_array)
high_data, high_offset, high_strides, high_shape = array_to_spec(high_array)
global_size, local_size = global_and_local_size(high_array.shape, self.chunk_size)

return self.q2c(self.queue, global_size, local_size,
in_data, in_offset, in_strides, in_shape,
low_data, low_offset, low_strides, low_shape,
high_data, high_offset, high_strides, high_shape,
wait_for=wait_for)

class Transform2d(object):
"""OpenCL implementation of 2D DTCWT.
Expand All @@ -104,8 +137,10 @@ def __init__(self, biort=DEFAULT_BIORT, qshift=DEFAULT_QSHIFT, queue=None):
self._queue = queue
self._biort_coeffs = coeffs.biort(biort)
self._input_dtype = np.float32
self._l1_convolution = convolve.Convolution1D(
self._queue, self._biort_coeffs, self._input_dtype)
self._l1_convolution = convolve.Convolution2D(self._queue, self._biort_coeffs)

# Prepare the q2c kernel
self._q2c = _Q2C(self._queue)

def forward(self, X, nlevels=3, include_scale=False, wait_for=None):
if include_scale:
Expand Down Expand Up @@ -147,10 +182,22 @@ def _level1_transform(self, X, wait_for):
if np.any(np.asarray(X.shape[:2]) % 2 == 1):
raise ValueError('Input must have even rows and columns')

# Do the first dimension convolution
lohi = cla.empty(self._queue, X.shape, dtype=cla.vec.float2)
evt = self._l1_convolution(X, lohi, wait_for=wait_for)
# Allocate workspace for convolution
ws_size = self._l1_convolution.workspace_size_for_input(X)
ws = cl.Buffer(self._queue.context, cl.mem_flags.READ_WRITE, ws_size)

# Create convolution output
output_array = cla.empty(self._queue, X.shape, self._l1_convolution.output_dtype)

# Do the convolution
evt = self._l1_convolution(X, output_array, ws)

# Create low and highpass output
half_shape = tuple(x>>1 for x in X.shape)
lowpass = cla.empty(self._queue, X.shape, np.float32)
highpass = cla.empty(self._queue, half_shape + (6,), np.complex64)

# Extract low- and highpass from convolution output
evt = self._q2c(output_array, lowpass, highpass, wait_for=[evt])

lp = cla.empty(self._queue, X.shape, dtype=np.float32)
hp = cla.empty(self._queue, X.shape + (6,), dtype=np.complex64)
return lp, hp, evt
return lowpass, highpass, evt
4 changes: 2 additions & 2 deletions tests/testopenclxfm2.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,5 +46,5 @@ def test_level1_forward_transform():
# Perform opencl level 1 transform
ocl = t.forward(plane_device, nlevels=1)

# TODO: Compare result
# assert_pyramids_almost_equal(gold, ocl)
# Compare result
assert_pyramids_almost_equal(gold, ocl)

0 comments on commit f8da946

Please sign in to comment.