hello_fft: Update to latest version from Andrew

See: http://www.aholme.co.uk/GPU_FFT/Main.htm
raspberrypi · Sep 15, 2014 · 83a47aa · 83a47aa
1 parent 0cbcb3a
commit 83a47aa
Show file tree

Hide file tree

Showing 42 changed files with 9,557 additions and 5,482 deletions.
diff --git a/host_applications/linux/apps/hello_pi/hello_fft/gpu_fft.c b/host_applications/linux/apps/hello_pi/hello_fft/gpu_fft.c
@@ -1,5 +1,6 @@
 /*
-Copyright (c) 2013, Andrew Holme.
+BCM2835 "GPU_FFT" release 2.0 BETA
+Copyright (c) 2014, Andrew Holme.
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -26,90 +27,69 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
 #include <string.h>
-#include <stdio.h>
 
 #include "gpu_fft.h"
-#include "mailbox.h"
 
-#define GPU_FFT_MEM_FLG 0xC // cached=0xC; direct=0x4
-#define GPU_FFT_MEM_MAP 0x0 // cached=0x0; direct=0x20000000
+#define GPU_FFT_BUSY_WAIT_LIMIT (5<<12) // ~1ms
 
 typedef struct GPU_FFT_COMPLEX COMPLEX;
 
-struct GPU_FFT_PTR {
-    unsigned vc;
-    union { COMPLEX  *cptr;
-            void     *vptr;
-            char     *bptr;
-            float    *fptr;
-            unsigned *uptr; } arm;
-};
-
-static unsigned advance (
-    struct GPU_FFT_PTR *ptr,
-    int bytes) {
-
-    unsigned vc = ptr->vc;
-    ptr->vc += bytes;
-    ptr->arm.bptr += bytes;
-    return vc;
-}
-
 int gpu_fft_prepare(
     int mb,         // mailbox file_desc
-    int log2_N,     // log2(FFT_length) = 8...17
+    int log2_N,     // log2(FFT_length) = 8...20
     int direction,  // GPU_FFT_FWD: fft(); GPU_FFT_REV: ifft()
     int jobs,       // number of transforms in batch
     struct GPU_FFT **fft) {
 
     unsigned info_bytes, twid_bytes, data_bytes, code_bytes, unif_bytes, mail_bytes;
-    unsigned size, handle, *uptr, vc_tw, vc_code, vc_data, vc_unifs[GPU_FFT_QPUS];
-    int i, q, shared, unique, passes;
+    unsigned size, *uptr, vc_tw, vc_data;
+    int i, q, shared, unique, passes, ret;
 
+    struct GPU_FFT_BASE *base;
     struct GPU_FFT_PTR ptr;
     struct GPU_FFT *info;
 
-    if (qpu_enable(mb, 1)) return -1;
-
     if (gpu_fft_twiddle_size(log2_N, &shared, &unique, &passes)) return -2;
 
+    info_bytes = 4096;
     data_bytes = (1+((sizeof(COMPLEX)<<log2_N)|4095));
     code_bytes = gpu_fft_shader_size(log2_N);
     twid_bytes = sizeof(COMPLEX)*16*(shared+GPU_FFT_QPUS*unique);
     unif_bytes = sizeof(int)*GPU_FFT_QPUS*(5+jobs*2);
     mail_bytes = sizeof(int)*GPU_FFT_QPUS*2;
-    info_bytes = sizeof(struct GPU_FFT);
 
-    size  = data_bytes*jobs*2 + // ping-pong data, aligned
+    size  = info_bytes +        // header
+            data_bytes*jobs*2 + // ping-pong data, aligned
             code_bytes +        // shader, aligned
             twid_bytes +        // twiddles
             unif_bytes +        // uniforms
-            mail_bytes +        // mailbox message
-            info_bytes;         // control
+            mail_bytes;         // mailbox message
 
-    // Shared memory
-    handle = mem_alloc(mb, size, 4096, GPU_FFT_MEM_FLG);
-    if (!handle) return -3;
+    ret = gpu_fft_alloc(mb, size, &ptr);
+    if (ret) return ret;
 
-    ptr.vc = mem_lock(mb, handle);
-    ptr.arm.vptr = mapmem(ptr.vc+GPU_FFT_MEM_MAP, size);
+    // Header
+    info = (struct GPU_FFT *) ptr.arm.vptr;
+    base = (struct GPU_FFT_BASE *) info;
+    gpu_fft_ptr_inc(&ptr, info_bytes);
 
-    // Control header
-    info = (struct GPU_FFT *) (ptr.arm.bptr + size - info_bytes);
+    // For transpose
+    info->x = 1<<log2_N;
+    info->y = jobs;
 
     // Ping-pong buffers leave results in or out of place
     info->in = info->out = ptr.arm.cptr;
     info->step = data_bytes / sizeof(COMPLEX);
     if (passes&1) info->out += info->step * jobs; // odd => out of place
-    vc_data = advance(&ptr, data_bytes*jobs*2);
+    vc_data = gpu_fft_ptr_inc(&ptr, data_bytes*jobs*2);
 
     // Shader code
     memcpy(ptr.arm.vptr, gpu_fft_shader_code(log2_N), code_bytes);
-    vc_code = advance(&ptr, code_bytes);
+    base->vc_code = gpu_fft_ptr_inc(&ptr, code_bytes);
 
     // Twiddles
     gpu_fft_twiddle_data(log2_N, direction, ptr.arm.fptr);
-    vc_tw = advance(&ptr, twid_bytes);
+    vc_tw = gpu_fft_ptr_inc(&ptr, twid_bytes);
 
     uptr = ptr.arm.uptr;
 
@@ -123,37 +103,33 @@ int gpu_fft_prepare(
             *uptr++ = vc_data + data_bytes*i + data_bytes*jobs;
         }
         *uptr++ = 0;
-        *uptr++ = (q==0); // IRQ enable, master only
+        *uptr++ = (q==0); // For mailbox: IRQ enable, master only
 
-        vc_unifs[q] = advance(&ptr, sizeof(int)*(5+jobs*2));
+        base->vc_unifs[q] = gpu_fft_ptr_inc(&ptr, sizeof(int)*(5+jobs*2));
     }
 
-    // Mailbox message
-    for (q=0; q<GPU_FFT_QPUS; q++) {
-        *uptr++ = vc_unifs[q];
-        *uptr++ = vc_code;
+    if ((jobs<<log2_N) <= GPU_FFT_BUSY_WAIT_LIMIT) {
+        // Direct register poking with busy wait
+        base->vc_msg = 0;
     }
-    info->vc_msg = ptr.vc;
+    else {
+        // Mailbox message
+        for (q=0; q<GPU_FFT_QPUS; q++) {
+            *uptr++ = base->vc_unifs[q];
+            *uptr++ = base->vc_code;
+        }
 
-    info->mb      = mb;
-    info->handle  = handle;
-    info->size    = size;
-    info->noflush = 1;
-    info->timeout = 1000; // ms
+        base->vc_msg = ptr.vc;
+    }
 
     *fft = info;
     return 0;
 }
 
 unsigned gpu_fft_execute(struct GPU_FFT *info) {
-    return execute_qpu(info->mb, GPU_FFT_QPUS, info->vc_msg, info->noflush, info->timeout);
+    return gpu_fft_base_exec(&info->base, GPU_FFT_QPUS); // hand the exec result back to the caller
 }
 
 void gpu_fft_release(struct GPU_FFT *info) {
-    int mb = info->mb;
-    unsigned handle = info->handle;
-    unmapmem(info->in, info->size);
-    mem_unlock(mb, handle);
-    mem_free(mb, handle);
-    qpu_enable(mb, 0);
-};
+    gpu_fft_base_release(&info->base);
+}
diff --git a/host_applications/linux/apps/hello_pi/hello_fft/gpu_fft.h b/host_applications/linux/apps/hello_pi/hello_fft/gpu_fft.h
@@ -1,5 +1,6 @@
 /*
-Copyright (c) 2013, Andrew Holme.
+BCM2835 "GPU_FFT" release 2.0 BETA
+Copyright (c) 2014, Andrew Holme.
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -25,6 +26,9 @@ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
+#ifndef __GPU_FFT__
+#define __GPU_FFT__
+
 #define GPU_FFT_QPUS 8
 
 #define GPU_FFT_PI 3.14159265358979323846
@@ -36,15 +40,30 @@ struct GPU_FFT_COMPLEX {
     float re, im;
 };
 
+struct GPU_FFT_PTR {
+    unsigned vc;
+    union { struct GPU_FFT_COMPLEX *cptr;
+            void                   *vptr;
+            char                   *bptr;
+            float                  *fptr;
+            unsigned               *uptr; } arm;
+};
+
+struct GPU_FFT_BASE {
+    int mb;
+    unsigned handle, size, vc_msg, vc_code, vc_unifs[GPU_FFT_QPUS];
+    volatile unsigned *peri;
+};
+
 struct GPU_FFT {
+    struct GPU_FFT_BASE base;
     struct GPU_FFT_COMPLEX *in, *out;
-    int mb, step;
-    unsigned timeout, noflush, handle, size, vc_msg;
+    int x, y, step;
 };
 
 int gpu_fft_prepare(
     int mb,         // mailbox file_desc
-    int log2_N,     // log2(FFT_length) = 8...17
+    int log2_N,     // log2(FFT_length) = 8...20
     int direction,  // GPU_FFT_FWD: fft(); GPU_FFT_REV: ifft()
     int jobs,       // number of transforms in batch
     struct GPU_FFT **fft);
@@ -60,3 +79,23 @@ int           gpu_fft_twiddle_size(int, int *, int *, int *);
 void          gpu_fft_twiddle_data(int, int, float *);
 unsigned int  gpu_fft_shader_size(int);
 unsigned int *gpu_fft_shader_code(int);
+
+// gpu_fft_base:
+
+unsigned gpu_fft_base_exec (
+    struct GPU_FFT_BASE *base,
+    int num_qpus);
+
+int gpu_fft_alloc (
+    int mb,
+    unsigned size,
+    struct GPU_FFT_PTR *ptr);
+
+void gpu_fft_base_release(
+    struct GPU_FFT_BASE *base);
+
+unsigned gpu_fft_ptr_inc (
+    struct GPU_FFT_PTR *ptr,
+    int bytes);
+
+#endif // __GPU_FFT__
diff --git a/host_applications/linux/apps/hello_pi/hello_fft/gpu_fft.txt b/host_applications/linux/apps/hello_pi/hello_fft/gpu_fft.txt
@@ -1,9 +1,10 @@
-BCM2835 "GPU_FFT" by Andrew Holme, 2013.
+BCM2835 "GPU_FFT" release 2.0 BETA by Andrew Holme, 2014.
 
 GPU_FFT is an FFT library for the Raspberry Pi which exploits the BCM2835 SoC
 3D hardware to deliver ten times more data throughput than is possible on the
 700 MHz ARM.  Kernels are provided for all power-of-2 FFT lengths between 256
-and 131,072 points inclusive.
+and 1,048,576 points inclusive.  A transpose function, which also uses the 3D
+hardware, is provided to support 2-dimensional transforms.
 
 
 *** Accuracy ***
@@ -15,22 +16,29 @@ is not scaled.  The relative root-mean-square (rms) error in parts-per-million
 log2(N) |  8    | 9    | 10   |  11   |  12  |  13  |  14  |  15  |  16 |  17
 ppm rms |  0.27 | 0.42 | 0.50 |  0.70 |  2.3 |  4.4 |  7.6 |  9.2 |  18 |  70
 
+log2(N) |  18 |  19 |  20 |                 8...17 batch of 10
+ppm rms | 100 | 180 | 360 |                18...20 batch of  1
+
 
 *** Throughput ***
 
-GPU_FFT is invoked through a kernel ioctl call which adds 100us overhead.  To
-mitigate this, transform batches can be executed with a single call.  Typical
-per-transform runtime in microseconds for various batch sizes and comparative
-figures for FFTW (FFTW_MEASURE mode) are as follows:
+GPU_FFT 1.0 had to be invoked through a "mailbox" which added a 100us overhead
+on every call.  To mitigate this, batches of transforms could be submitted via
+a single call.  GPU_FFT 2.0 avoids this 100us overhead by poking GPU registers
+directly from the ARM when the total batch runtime will be short, but it still
+uses the mailbox for longer jobs to avoid busy waiting at 100% CPU for too long.
+
+Typical per-transform runtimes for batch sizes of 1 and 10, and comparative
+figures for FFTW (FFTW_MEASURE mode), are:
+
+log2(N) |   8   |   9   |  10   |  11   |  12  |  13  |  14  |  15  |
+      1 | 0.036 | 0.051 | 0.070 | 0.11  | 0.24 | 0.58 |  1.2 |  3.3 |
+     10 | 0.016 | 0.027 | 0.045 | 0.095 | 0.25 | 0.61 |  1.2 |  3.2 |
+   FFTW | 0.092 | 0.22  | 0.48  | 0.95  | 3.0  | 5.1  | 12   | 31   |
 
-log2(N) |   8 |   9 |  10 |  11 |   12 |   13 |    14 |    15 |    16 |     17
-      1 | 112 | 125 | 136 | 180 |  298 |  689 |  1274 |  3397 |  6978 |  16734
-      2 |  56 |  74 |  85 | 133 |  285 |  663 |  1227 |  3362 |  6759 |  16179
-      5 |  31 |  75 |  61 | 113 |  274 |  631 |  1188 |  3228 |  6693 |  16180
-     10 |  22 |  37 |  54 | 107 |  256 |  624 |  1167 |  3225 |  6703 |  16110
-     20 |  19 |  31 |  52 | 101 |  252 |  615 |  1138 |  3202 |  6684 |  16181
-     50 |  16 |  26 |  45 |  93 |  240 |  608 |  1131 |  3196 |  6674 |  16171
-   FFTW |  92 | 217 | 482 | 952 | 3002 | 5082 | 12005 | 31211 | 82769 | 183731
+log2(N) |  16  |  17 |  18 |  19 |   20 |           All times in
+      1 |  6.8 |  16 |  42 |  95 |  190 |           milliseconds
+   FFTW | 83   | 180 | 560 | 670 | 1600 |           2 sig. figs.
 
 
 *** API functions ***
@@ -49,7 +57,7 @@ log2(N) |   8 |   9 |  10 |  11 |   12 |   13 |    14 |    15 |    16 |     17
 
     int mb          Mailbox file descriptor obtained by calling mbox_open()
 
-    int log2_N      log2(FFT length) = 8 to 17
+    int log2_N      log2(FFT length) = 8 to 20
 
     int direction   FFT direction:  GPU_FFT_FWD for forward FFT
                                     GPU_FFT_REV for inverse FFT
@@ -127,3 +135,23 @@ waiting for rendering, call glFlush() and glFinish() afterwards as follows:
         gpu_fft_execute(....); // blocking call
         ....
     }
+
+
+*** 2-dimensional FFT ***
+
+Please study the hello_fft_2d demo source, which is built and executed thus:
+
+make hello_fft_2d.bin
+sudo ./hello_fft_2d.bin
+
+This generates a Windows BMP file: "hello_fft_2d.bmp"
+
+The demo uses a square 512x512 array; however, rectangular arrays are allowed.
+The following lines in gpu_fft_trans.c clamp each dimension to a safe size:
+
+    ptr.arm.uptr[6] = src->x < dst->y? src->x : dst->y;
+    ptr.arm.uptr[7] = src->y < dst->x? src->y : dst->x;
+
+One may transpose the output from the second FFT pass back into the first pass
+input buffer, by preparing and executing a second transposition; however, this
+is probably unnecessary.  It depends on how the final output will be accessed.