Skip to content

Commit

Permalink
hello_fft: Update to latest version from Andrew
Browse files Browse the repository at this point in the history
  • Loading branch information
popcornmix committed Sep 15, 2014
1 parent 0cbcb3a commit 83a47aa
Show file tree
Hide file tree
Showing 42 changed files with 9,557 additions and 5,482 deletions.
102 changes: 39 additions & 63 deletions host_applications/linux/apps/hello_pi/hello_fft/gpu_fft.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/*
Copyright (c) 2013, Andrew Holme.
BCM2835 "GPU_FFT" release 2.0 BETA
Copyright (c) 2014, Andrew Holme.
All rights reserved.
Redistribution and use in source and binary forms, with or without
Expand All @@ -26,90 +27,69 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include <string.h>
#include <stdio.h>

#include "gpu_fft.h"
#include "mailbox.h"

#define GPU_FFT_MEM_FLG 0xC // cached=0xC; direct=0x4
#define GPU_FFT_MEM_MAP 0x0 // cached=0x0; direct=0x20000000
#define GPU_FFT_BUSY_WAIT_LIMIT (5<<12) // ~1ms

// Short local alias for the complex sample type.
typedef struct GPU_FFT_COMPLEX COMPLEX;

// Cursor over a buffer that is tracked by two addresses at once: an unsigned
// address value (vc) and an ARM-side pointer (arm). gpu_fft_prepare() fills
// vc from mem_lock() and arm from mapmem() of that same address, and
// advance() moves both by the same byte count.
// NOTE(review): vc is presumably the VideoCore (GPU) address - confirm.
struct GPU_FFT_PTR {
    unsigned vc;
    union {
        COMPLEX  *cptr; // view as complex samples
        void     *vptr; // untyped view
        char     *bptr; // byte view, used for byte-granular stepping
        float    *fptr; // view as floats
        unsigned *uptr; // view as unsigned words
    } arm;
};

// Step the cursor forward by `bytes`: both the vc address and the ARM-side
// pointer move by the same amount.
// Returns the vc address the cursor had before the step, i.e. the address of
// the region that was just stepped over.
static unsigned advance(struct GPU_FFT_PTR *ptr, int bytes)
{
    const unsigned start = ptr->vc;

    ptr->arm.bptr += bytes;
    ptr->vc = start + bytes;
    return start;
}

int gpu_fft_prepare(
int mb, // mailbox file_desc
int log2_N, // log2(FFT_length) = 8...17
int log2_N, // log2(FFT_length) = 8...20
int direction, // GPU_FFT_FWD: fft(); GPU_FFT_REV: ifft()
int jobs, // number of transforms in batch
struct GPU_FFT **fft) {

unsigned info_bytes, twid_bytes, data_bytes, code_bytes, unif_bytes, mail_bytes;
unsigned size, handle, *uptr, vc_tw, vc_code, vc_data, vc_unifs[GPU_FFT_QPUS];
int i, q, shared, unique, passes;
unsigned size, *uptr, vc_tw, vc_data;
int i, q, shared, unique, passes, ret;

struct GPU_FFT_BASE *base;
struct GPU_FFT_PTR ptr;
struct GPU_FFT *info;

if (qpu_enable(mb, 1)) return -1;

if (gpu_fft_twiddle_size(log2_N, &shared, &unique, &passes)) return -2;

info_bytes = 4096;
data_bytes = (1+((sizeof(COMPLEX)<<log2_N)|4095));
code_bytes = gpu_fft_shader_size(log2_N);
twid_bytes = sizeof(COMPLEX)*16*(shared+GPU_FFT_QPUS*unique);
unif_bytes = sizeof(int)*GPU_FFT_QPUS*(5+jobs*2);
mail_bytes = sizeof(int)*GPU_FFT_QPUS*2;
info_bytes = sizeof(struct GPU_FFT);

size = data_bytes*jobs*2 + // ping-pong data, aligned
size = info_bytes + // header
data_bytes*jobs*2 + // ping-pong data, aligned
code_bytes + // shader, aligned
twid_bytes + // twiddles
unif_bytes + // uniforms
mail_bytes + // mailbox message
info_bytes; // control
mail_bytes; // mailbox message

// Shared memory
handle = mem_alloc(mb, size, 4096, GPU_FFT_MEM_FLG);
if (!handle) return -3;
ret = gpu_fft_alloc(mb, size, &ptr);
if (ret) return ret;

ptr.vc = mem_lock(mb, handle);
ptr.arm.vptr = mapmem(ptr.vc+GPU_FFT_MEM_MAP, size);
// Header
info = (struct GPU_FFT *) ptr.arm.vptr;
base = (struct GPU_FFT_BASE *) info;
gpu_fft_ptr_inc(&ptr, info_bytes);

// Control header
info = (struct GPU_FFT *) (ptr.arm.bptr + size - info_bytes);
// For transpose
info->x = 1<<log2_N;
info->y = jobs;

// Ping-pong buffers leave results in or out of place
info->in = info->out = ptr.arm.cptr;
info->step = data_bytes / sizeof(COMPLEX);
if (passes&1) info->out += info->step * jobs; // odd => out of place
vc_data = advance(&ptr, data_bytes*jobs*2);
vc_data = gpu_fft_ptr_inc(&ptr, data_bytes*jobs*2);

// Shader code
memcpy(ptr.arm.vptr, gpu_fft_shader_code(log2_N), code_bytes);
vc_code = advance(&ptr, code_bytes);
base->vc_code = gpu_fft_ptr_inc(&ptr, code_bytes);

// Twiddles
gpu_fft_twiddle_data(log2_N, direction, ptr.arm.fptr);
vc_tw = advance(&ptr, twid_bytes);
vc_tw = gpu_fft_ptr_inc(&ptr, twid_bytes);

uptr = ptr.arm.uptr;

Expand All @@ -123,37 +103,33 @@ int gpu_fft_prepare(
*uptr++ = vc_data + data_bytes*i + data_bytes*jobs;
}
*uptr++ = 0;
*uptr++ = (q==0); // IRQ enable, master only
*uptr++ = (q==0); // For mailbox: IRQ enable, master only

vc_unifs[q] = advance(&ptr, sizeof(int)*(5+jobs*2));
base->vc_unifs[q] = gpu_fft_ptr_inc(&ptr, sizeof(int)*(5+jobs*2));
}

// Mailbox message
for (q=0; q<GPU_FFT_QPUS; q++) {
*uptr++ = vc_unifs[q];
*uptr++ = vc_code;
if ((jobs<<log2_N) <= GPU_FFT_BUSY_WAIT_LIMIT) {
// Direct register poking with busy wait
base->vc_msg = 0;
}
info->vc_msg = ptr.vc;
else {
// Mailbox message
for (q=0; q<GPU_FFT_QPUS; q++) {
*uptr++ = base->vc_unifs[q];
*uptr++ = base->vc_code;
}

info->mb = mb;
info->handle = handle;
info->size = size;
info->noflush = 1;
info->timeout = 1000; // ms
base->vc_msg = ptr.vc;
}

*fft = info;
return 0;
}

// Execute the transforms set up by gpu_fft_prepare() by handing info->base
// to gpu_fft_base_exec() for GPU_FFT_QPUS QPUs.
//
// info: handle produced by gpu_fft_prepare().
//
// Returns the value reported by gpu_fft_base_exec(). The result has to be
// returned explicitly: this function is declared to return unsigned, and
// falling off the end of it would hand callers an indeterminate value.
unsigned gpu_fft_execute(struct GPU_FFT *info) {
    return gpu_fft_base_exec(&info->base, GPU_FFT_QPUS);
}

// Release what was set up for a prepared FFT.
//
// info: handle produced by gpu_fft_prepare().
//
// NOTE(review): two bodies appear to be interleaved here. The first one
// (through the `};` line) unmaps info->in, unlocks and frees the handle and
// calls qpu_enable(mb, 0); the gpu_fft_base_release() call then sits after
// that closing brace. Confirm which body is current before building this text.
void gpu_fft_release(struct GPU_FFT *info) {
int mb = info->mb;
unsigned handle = info->handle;
unmapmem(info->in, info->size);
mem_unlock(mb, handle);
mem_free(mb, handle);
qpu_enable(mb, 0);
};
// Delegates the release to the shared base helper.
gpu_fft_base_release(&info->base);
}
47 changes: 43 additions & 4 deletions host_applications/linux/apps/hello_pi/hello_fft/gpu_fft.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/*
Copyright (c) 2013, Andrew Holme.
BCM2835 "GPU_FFT" release 2.0 BETA
Copyright (c) 2014, Andrew Holme.
All rights reserved.
Redistribution and use in source and binary forms, with or without
Expand All @@ -25,6 +26,9 @@ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#ifndef __GPU_FFT__
#define __GPU_FFT__

#define GPU_FFT_QPUS 8

#define GPU_FFT_PI 3.14159265358979323846
Expand All @@ -36,15 +40,30 @@ struct GPU_FFT_COMPLEX {
float re, im;
};

// Cursor over a buffer that is tracked by two addresses at once: an unsigned
// address value (vc) and an ARM-side pointer (arm) that can be viewed as
// several element types. Passed to gpu_fft_alloc() and gpu_fft_ptr_inc().
// NOTE(review): vc is presumably the VideoCore (GPU) address - confirm.
struct GPU_FFT_PTR {
    unsigned vc;
    union {
        struct GPU_FFT_COMPLEX *cptr; // view as complex samples
        void                   *vptr; // untyped view
        char                   *bptr; // byte view
        float                  *fptr; // view as floats
        unsigned               *uptr; // view as unsigned words
    } arm;
};

// State shared by the gpu_fft_base_* helpers; embedded as the first member of
// struct GPU_FFT. Member order is significant and must not be changed.
struct GPU_FFT_BASE {
    int mb;                          // presumably the mailbox file descriptor - confirm
    unsigned handle;                 // presumably the shared-memory allocation handle - confirm
    unsigned size;                   // presumably the allocation size in bytes - confirm
    unsigned vc_msg;                 // address of the mailbox message; gpu_fft_prepare() stores 0
                                     // here when it selects direct register poking instead
    unsigned vc_code;                // address of the shader code copied in by gpu_fft_prepare()
    unsigned vc_unifs[GPU_FFT_QPUS]; // address of each QPU's uniforms block
    volatile unsigned *peri;         // NOTE(review): looks like a peripheral register mapping - confirm
};

// Handle returned by gpu_fft_prepare() and consumed by gpu_fft_execute() and
// gpu_fft_release().
//
// NOTE(review): `step` is declared twice below and two sets of bookkeeping
// members appear side by side (mb/timeout/noflush/handle/size/vc_msg versus
// the embedded `base`); this looks like two versions of the struct
// interleaved. Confirm which members are current.
struct GPU_FFT {
// Shared state used by the gpu_fft_base_* helpers.
struct GPU_FFT_BASE base;
// Ping-pong data buffers; gpu_fft_prepare() moves `out` past the input
// buffers when the number of passes is odd (result is out of place).
struct GPU_FFT_COMPLEX *in, *out;
int mb, step;
unsigned timeout, noflush, handle, size, vc_msg;
// x = 1<<log2_N and y = jobs (set "for transpose" in gpu_fft_prepare());
// step = padded per-transform buffer size in complex elements.
int x, y, step;
};

// Allocate and initialise everything needed to run a batch of FFTs and hand
// back a handle through *fft.
//
// Returns 0 on success. The failure paths visible in gpu_fft.c return -1 when
// qpu_enable() fails, -2 when gpu_fft_twiddle_size() rejects log2_N, and
// otherwise the non-zero result of the shared-memory allocation.
//
// NOTE(review): log2_N is listed twice below with different documented ranges
// (8...17 and 8...20); this looks like two versions of the line interleaved.
// Confirm which range is current.
int gpu_fft_prepare(
int mb, // mailbox file_desc
int log2_N, // log2(FFT_length) = 8...17
int log2_N, // log2(FFT_length) = 8...20
int direction, // GPU_FFT_FWD: fft(); GPU_FFT_REV: ifft()
int jobs, // number of transforms in batch
struct GPU_FFT **fft);
Expand All @@ -60,3 +79,23 @@ int gpu_fft_twiddle_size(int, int *, int *, int *);
// Writes the twiddle factors into the float buffer given as the third
// argument; gpu_fft_prepare() calls it as (log2_N, direction, buffer).
void gpu_fft_twiddle_data(int, int, float *);
// Size of the shader code for the given log2_N; gpu_fft_prepare() uses the
// result as a byte count for memcpy().
unsigned int gpu_fft_shader_size(int);
// Pointer to the shader code for the given log2_N; gpu_fft_prepare() copies
// gpu_fft_shader_size(log2_N) bytes from it into shared memory.
unsigned int *gpu_fft_shader_code(int);

// gpu_fft_base:

// Run the prepared job described by `base` on `num_qpus` QPUs.
// gpu_fft_execute() calls this with GPU_FFT_QPUS.
// NOTE(review): meaning of the unsigned result is not visible here - confirm.
unsigned gpu_fft_base_exec (
struct GPU_FFT_BASE *base,
int num_qpus);

// Obtain `size` bytes of shared memory through mailbox `mb` and initialise
// `ptr` to its start. gpu_fft_prepare() treats a non-zero result as failure
// and returns it unchanged.
int gpu_fft_alloc (
int mb,
unsigned size,
struct GPU_FFT_PTR *ptr);

// Release whatever `base` holds; called from gpu_fft_release().
void gpu_fft_base_release(
struct GPU_FFT_BASE *base);

// Advance `ptr` by `bytes`. gpu_fft_prepare() stores the result as the
// address of the region it has just filled.
// NOTE(review): presumably returns the vc address from before the increment,
// like the advance() helper in gpu_fft.c - confirm.
unsigned gpu_fft_ptr_inc (
struct GPU_FFT_PTR *ptr,
int bytes);

#endif // __GPU_FFT__
58 changes: 43 additions & 15 deletions host_applications/linux/apps/hello_pi/hello_fft/gpu_fft.txt
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
BCM2835 "GPU_FFT" by Andrew Holme, 2013.
BCM2835 "GPU_FFT" release 2.0 BETA by Andrew Holme, 2014.

GPU_FFT is an FFT library for the Raspberry Pi which exploits the BCM2835 SoC
3D hardware to deliver ten times more data throughput than is possible on the
700 MHz ARM. Kernels are provided for all power-of-2 FFT lengths between 256
and 131,072 points inclusive.
and 1,048,576 points inclusive. A transpose function, which also uses the 3D
hardware, is provided to support 2-dimensional transforms.


*** Accuracy ***
Expand All @@ -15,22 +16,29 @@ is not scaled. The relative root-mean-square (rms) error in parts-per-million
log2(N) | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17
ppm rms | 0.27 | 0.42 | 0.50 | 0.70 | 2.3 | 4.4 | 7.6 | 9.2 | 18 | 70

log2(N) | 18 | 19 | 20 | 8...17 batch of 10
ppm rms | 100 | 180 | 360 | 18...20 batch of 1


*** Throughput ***

GPU_FFT is invoked through a kernel ioctl call which adds 100us overhead. To
mitigate this, transform batches can be executed with a single call. Typical
per-transform runtime in microseconds for various batch sizes and comparative
figures for FFTW (FFTW_MEASURE mode) are as follows:
GPU_FFT 1.0 had to be invoked through a "mailbox" which added a 100us overhead
on every call. To mitigate this, batches of transforms could be submitted via
a single call. GPU_FFT 2.0 avoids this 100us overhead by poking GPU registers
directly from the ARM if total batch runtime will be short; but still uses the
mailbox for longer jobs to avoid busy waiting at 100% CPU for too long.

Typical per-transform runtimes for batch sizes of 1 and 10, and comparative
figures for FFTW (FFTW_MEASURE mode), are:

log2(N) | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 |
1 | 0.036 | 0.051 | 0.070 | 0.11 | 0.24 | 0.58 | 1.2 | 3.3 |
10 | 0.016 | 0.027 | 0.045 | 0.095 | 0.25 | 0.61 | 1.2 | 3.2 |
FFTW | 0.092 | 0.22 | 0.48 | 0.95 | 3.0 | 5.1 | 12 | 31 |

log2(N) | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17
1 | 112 | 125 | 136 | 180 | 298 | 689 | 1274 | 3397 | 6978 | 16734
2 | 56 | 74 | 85 | 133 | 285 | 663 | 1227 | 3362 | 6759 | 16179
5 | 31 | 75 | 61 | 113 | 274 | 631 | 1188 | 3228 | 6693 | 16180
10 | 22 | 37 | 54 | 107 | 256 | 624 | 1167 | 3225 | 6703 | 16110
20 | 19 | 31 | 52 | 101 | 252 | 615 | 1138 | 3202 | 6684 | 16181
50 | 16 | 26 | 45 | 93 | 240 | 608 | 1131 | 3196 | 6674 | 16171
FFTW | 92 | 217 | 482 | 952 | 3002 | 5082 | 12005 | 31211 | 82769 | 183731
log2(N) | 16 | 17 | 18 | 19 | 20 | All times in
1 | 6.8 | 16 | 42 | 95 | 190 | milliseconds
FFTW | 83 | 180 | 560 | 670 | 1600 | 2 sig. figs.


*** API functions ***
Expand All @@ -49,7 +57,7 @@ log2(N) | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17

int mb Mailbox file descriptor obtained by calling mbox_open()

int log2_N log2(FFT length) = 8 to 17
int log2_N log2(FFT length) = 8 to 20

int direction FFT direction: GPU_FFT_FWD for forward FFT
GPU_FFT_REV for inverse FFT
Expand Down Expand Up @@ -127,3 +135,23 @@ waiting for rendering, call glFlush() and glFinish() afterwards as follows:
gpu_fft_execute(....); // blocking call
....
}


*** 2-dimensional FFT ***

Please study the hello_fft_2d demo source, which is built and executed thus:

make hello_fft_2d.bin
sudo ./hello_fft_2d.bin

This generates a Windows BMP file: "hello_fft_2d.bmp"

The demo uses a square 512x512 array; however, rectangular arrays are allowed.
The following lines in gpu_fft_trans.c keep the transpose safe by limiting each
transposed dimension to the smaller of the source and destination extents:

ptr.arm.uptr[6] = src->x < dst->y? src->x : dst->y;
ptr.arm.uptr[7] = src->y < dst->x? src->y : dst->x;

One may transpose the output from the second FFT pass back into the first pass
input buffer, by preparing and executing a second transposition; however, this
is probably unnecessary. It depends on how the final output will be accessed.
Loading

0 comments on commit 83a47aa

Please sign in to comment.