Skip to content

Commit

Permalink
kernel: config: Enable CONFIG_HID_ELO for improved touchscreen support
Browse files Browse the repository at this point in the history
See: raspberrypi/linux#680

kernel: I2C: Only register the I2C device for the current board revision
See: http://www.raspberrypi.org/forums/viewtopic.php?f=43&t=84000

kernel: power: Add defines for controlling DSI power through mailbox interface
kernel: vcio: Fix incorrect and add new mailbox tags
kernel: bcm2708_fb: Add pan and vsync controls
See: raspberrypi/linux#679

firmware: Improved editing of DT node
This will now create a new node if it doesn't exist, and appends to bootargs rather than replacing them.
Note: padding is still required

firmware: platform: Add support for generic DPI display
More info soon

firmware: i2c: Make sure that sda pins and scl pins are set correctly when the I2C driver is opened

firmware: arm_loader: Allow DSI power to be controlled from arm

firmware: arm_display: Power off hdmi output when framebuffer interface is blanked

firmware: arm_display: Make syncs optional on dispman updates and add a mailbox option to wait for vsync
firmware: arm_display: Avoid freeing the framebuffer when allocated size doesn't change

hello_pi: hello_fft: Update to latest version from Andrew
See: http://www.aholme.co.uk/GPU_FFT/Main.htm
  • Loading branch information
popcornmix committed Sep 4, 2014
1 parent 1bb122b commit e45f94d
Show file tree
Hide file tree
Showing 117 changed files with 53,048 additions and 44,889 deletions.
Binary file modified boot/fixup.dat
Binary file not shown.
Binary file modified boot/fixup_cd.dat
Binary file not shown.
Binary file modified boot/fixup_x.dat
Binary file not shown.
Binary file modified boot/kernel.img
Binary file not shown.
Binary file modified boot/start.elf
Binary file not shown.
Binary file modified boot/start_cd.elf
Binary file not shown.
Binary file modified boot/start_x.elf
Binary file not shown.
67,851 changes: 33,928 additions & 33,923 deletions extra/System.map

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion extra/git_hash
@@ -1 +1 @@
b65f7df60146ce8513ce64b9d7acd4a026878ee5
e50d6adf1df06a1d4f8e5938c23ed7c3502ed02d
2 changes: 1 addition & 1 deletion extra/uname_string
@@ -1 +1 @@
Linux version 3.12.26+ (dc4@dc4-XPS13-9333) (gcc version 4.8.3 20140303 (prerelease) (crosstool-NG linaro-1.13.1+bzr2650 - Linaro GCC 2014.03) ) #707 PREEMPT Sat Aug 30 17:39:19 BST 2014
Linux version 3.12.26+ (dc4@dc4-XPS13-9333) (gcc version 4.8.3 20140303 (prerelease) (crosstool-NG linaro-1.13.1+bzr2650 - Linaro GCC 2014.03) ) #708 PREEMPT Thu Sep 4 15:43:15 BST 2014
Binary file modified hardfp/opt/vc/lib/libEGL_static.a
Binary file not shown.
Binary file modified hardfp/opt/vc/lib/libGLESv2_static.a
Binary file not shown.
Binary file modified hardfp/opt/vc/lib/libkhrn_client.a
Binary file not shown.
Binary file modified hardfp/opt/vc/lib/libkhrn_static.a
Binary file not shown.
Binary file modified hardfp/opt/vc/lib/libvcfiled_check.a
Binary file not shown.
Binary file modified hardfp/opt/vc/lib/libvchostif.a
Binary file not shown.
Binary file modified hardfp/opt/vc/lib/libvcilcs.a
Binary file not shown.
Binary file modified hardfp/opt/vc/lib/libvmcs_rpc_client.a
Binary file not shown.
102 changes: 39 additions & 63 deletions hardfp/opt/vc/src/hello_pi/hello_fft/gpu_fft.c
@@ -1,5 +1,6 @@
/*
Copyright (c) 2013, Andrew Holme.
BCM2835 "GPU_FFT" release 2.0 BETA
Copyright (c) 2014, Andrew Holme.
All rights reserved.
Redistribution and use in source and binary forms, with or without
Expand All @@ -26,90 +27,69 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include <string.h>
#include <stdio.h>

#include "gpu_fft.h"
#include "mailbox.h"

#define GPU_FFT_MEM_FLG 0xC // cached=0xC; direct=0x4
#define GPU_FFT_MEM_MAP 0x0 // cached=0x0; direct=0x20000000
#define GPU_FFT_BUSY_WAIT_LIMIT (5<<12) // ~1ms

/* Local shorthand for the complex sample type (struct GPU_FFT_COMPLEX). */
typedef struct GPU_FFT_COMPLEX COMPLEX;

/*
 * Cursor into a buffer that is tracked two ways at once:
 *   vc  - an unsigned address value for the same location
 *   arm - a pointer to the location, viewable through whichever
 *         element type the caller needs at that moment
 * All members of `arm` alias the same pointer storage; only the pointee
 * type differs.
 */
struct GPU_FFT_PTR {
    unsigned vc;
    union {
        COMPLEX  *cptr;   /* complex samples */
        void     *vptr;   /* untyped, e.g. for memcpy */
        char     *bptr;   /* byte-granular arithmetic */
        float    *fptr;   /* float data */
        unsigned *uptr;   /* 32-bit words */
    } arm;
};

/*
 * Move the cursor forward by `bytes`.
 *
 * Both the `vc` address value and the `arm` pointer are stepped by the same
 * amount so the two views stay in lock-step.
 *
 * Returns the `vc` value the cursor had BEFORE the step, i.e. the address of
 * the region that was just skipped over.
 */
static unsigned advance(struct GPU_FFT_PTR *ptr, int bytes)
{
    const unsigned start = ptr->vc;

    ptr->arm.bptr += bytes;
    ptr->vc = start + bytes;

    return start;
}

int gpu_fft_prepare(
int mb, // mailbox file_desc
int log2_N, // log2(FFT_length) = 8...17
int log2_N, // log2(FFT_length) = 8...20
int direction, // GPU_FFT_FWD: fft(); GPU_FFT_REV: ifft()
int jobs, // number of transforms in batch
struct GPU_FFT **fft) {

unsigned info_bytes, twid_bytes, data_bytes, code_bytes, unif_bytes, mail_bytes;
unsigned size, handle, *uptr, vc_tw, vc_code, vc_data, vc_unifs[GPU_FFT_QPUS];
int i, q, shared, unique, passes;
unsigned size, *uptr, vc_tw, vc_data;
int i, q, shared, unique, passes, ret;

struct GPU_FFT_BASE *base;
struct GPU_FFT_PTR ptr;
struct GPU_FFT *info;

if (qpu_enable(mb, 1)) return -1;

if (gpu_fft_twiddle_size(log2_N, &shared, &unique, &passes)) return -2;

info_bytes = 4096;
data_bytes = (1+((sizeof(COMPLEX)<<log2_N)|4095));
code_bytes = gpu_fft_shader_size(log2_N);
twid_bytes = sizeof(COMPLEX)*16*(shared+GPU_FFT_QPUS*unique);
unif_bytes = sizeof(int)*GPU_FFT_QPUS*(5+jobs*2);
mail_bytes = sizeof(int)*GPU_FFT_QPUS*2;
info_bytes = sizeof(struct GPU_FFT);

size = data_bytes*jobs*2 + // ping-pong data, aligned
size = info_bytes + // header
data_bytes*jobs*2 + // ping-pong data, aligned
code_bytes + // shader, aligned
twid_bytes + // twiddles
unif_bytes + // uniforms
mail_bytes + // mailbox message
info_bytes; // control
mail_bytes; // mailbox message

// Shared memory
handle = mem_alloc(mb, size, 4096, GPU_FFT_MEM_FLG);
if (!handle) return -3;
ret = gpu_fft_alloc(mb, size, &ptr);
if (ret) return ret;

ptr.vc = mem_lock(mb, handle);
ptr.arm.vptr = mapmem(ptr.vc+GPU_FFT_MEM_MAP, size);
// Header
info = (struct GPU_FFT *) ptr.arm.vptr;
base = (struct GPU_FFT_BASE *) info;
gpu_fft_ptr_inc(&ptr, info_bytes);

// Control header
info = (struct GPU_FFT *) (ptr.arm.bptr + size - info_bytes);
// For transpose
info->x = 1<<log2_N;
info->y = jobs;

// Ping-pong buffers leave results in or out of place
info->in = info->out = ptr.arm.cptr;
info->step = data_bytes / sizeof(COMPLEX);
if (passes&1) info->out += info->step * jobs; // odd => out of place
vc_data = advance(&ptr, data_bytes*jobs*2);
vc_data = gpu_fft_ptr_inc(&ptr, data_bytes*jobs*2);

// Shader code
memcpy(ptr.arm.vptr, gpu_fft_shader_code(log2_N), code_bytes);
vc_code = advance(&ptr, code_bytes);
base->vc_code = gpu_fft_ptr_inc(&ptr, code_bytes);

// Twiddles
gpu_fft_twiddle_data(log2_N, direction, ptr.arm.fptr);
vc_tw = advance(&ptr, twid_bytes);
vc_tw = gpu_fft_ptr_inc(&ptr, twid_bytes);

uptr = ptr.arm.uptr;

Expand All @@ -123,37 +103,33 @@ int gpu_fft_prepare(
*uptr++ = vc_data + data_bytes*i + data_bytes*jobs;
}
*uptr++ = 0;
*uptr++ = (q==0); // IRQ enable, master only
*uptr++ = (q==0); // For mailbox: IRQ enable, master only

vc_unifs[q] = advance(&ptr, sizeof(int)*(5+jobs*2));
base->vc_unifs[q] = gpu_fft_ptr_inc(&ptr, sizeof(int)*(5+jobs*2));
}

// Mailbox message
for (q=0; q<GPU_FFT_QPUS; q++) {
*uptr++ = vc_unifs[q];
*uptr++ = vc_code;
if ((jobs<<log2_N) <= GPU_FFT_BUSY_WAIT_LIMIT) {
// Direct register poking with busy wait
base->vc_msg = 0;
}
info->vc_msg = ptr.vc;
else {
// Mailbox message
for (q=0; q<GPU_FFT_QPUS; q++) {
*uptr++ = base->vc_unifs[q];
*uptr++ = base->vc_code;
}

info->mb = mb;
info->handle = handle;
info->size = size;
info->noflush = 1;
info->timeout = 1000; // ms
base->vc_msg = ptr.vc;
}

*fft = info;
return 0;
}

unsigned gpu_fft_execute(struct GPU_FFT *info) {
return execute_qpu(info->mb, GPU_FFT_QPUS, info->vc_msg, info->noflush, info->timeout);
gpu_fft_base_exec(&info->base, GPU_FFT_QPUS);
}

void gpu_fft_release(struct GPU_FFT *info) {
int mb = info->mb;
unsigned handle = info->handle;
unmapmem(info->in, info->size);
mem_unlock(mb, handle);
mem_free(mb, handle);
qpu_enable(mb, 0);
};
gpu_fft_base_release(&info->base);
}
47 changes: 43 additions & 4 deletions hardfp/opt/vc/src/hello_pi/hello_fft/gpu_fft.h
@@ -1,5 +1,6 @@
/*
Copyright (c) 2013, Andrew Holme.
BCM2835 "GPU_FFT" release 2.0 BETA
Copyright (c) 2014, Andrew Holme.
All rights reserved.
Redistribution and use in source and binary forms, with or without
Expand All @@ -25,6 +26,9 @@ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#ifndef __GPU_FFT__
#define __GPU_FFT__

#define GPU_FFT_QPUS 8

#define GPU_FFT_PI 3.14159265358979323846
Expand All @@ -36,15 +40,30 @@ struct GPU_FFT_COMPLEX {
float re, im;
};

/*
 * Cursor into a buffer that is tracked two ways at once:
 *   vc  - an unsigned address value for the same location
 *   arm - a pointer to the location, viewable through whichever
 *         element type the caller needs at that moment
 * All members of `arm` alias the same pointer storage; only the pointee
 * type differs.
 */
struct GPU_FFT_PTR {
    unsigned vc;
    union {
        struct GPU_FFT_COMPLEX *cptr;   /* complex samples */
        void                   *vptr;   /* untyped, e.g. for memcpy */
        char                   *bptr;   /* byte-granular arithmetic */
        float                  *fptr;   /* float data */
        unsigned               *uptr;   /* 32-bit words */
    } arm;
};

/*
 * State shared by the GPU_FFT entry points; embedded as the first member of
 * struct GPU_FFT and passed to the gpu_fft_base_* helpers.
 */
struct GPU_FFT_BASE {
    int mb;                           /* mailbox file descriptor */
    unsigned handle;                  /* NOTE(review): presumably the shared-memory allocation handle - confirm */
    unsigned size;                    /* NOTE(review): presumably the allocation size in bytes - confirm */
    unsigned vc_msg;                  /* address of the mailbox message; gpu_fft_prepare stores 0 when
                                         direct register poking with busy wait is chosen instead */
    unsigned vc_code;                 /* address of the shader code */
    unsigned vc_unifs[GPU_FFT_QPUS];  /* address of each QPU's uniforms */
    volatile unsigned *peri;          /* NOTE(review): presumably mapped peripheral registers - confirm */
};

struct GPU_FFT {
struct GPU_FFT_BASE base;
struct GPU_FFT_COMPLEX *in, *out;
int mb, step;
unsigned timeout, noflush, handle, size, vc_msg;
int x, y, step;
};

int gpu_fft_prepare(
int mb, // mailbox file_desc
int log2_N, // log2(FFT_length) = 8...17
int log2_N, // log2(FFT_length) = 8...20
int direction, // GPU_FFT_FWD: fft(); GPU_FFT_REV: ifft()
int jobs, // number of transforms in batch
struct GPU_FFT **fft);
Expand All @@ -60,3 +79,23 @@ int gpu_fft_twiddle_size(int, int *, int *, int *);
void gpu_fft_twiddle_data(int, int, float *);
unsigned int gpu_fft_shader_size(int);
unsigned int *gpu_fft_shader_code(int);

// gpu_fft_base: helper routines used by gpu_fft.c. Only the prototypes are
// declared here; the definitions are not part of this header.

// Called by gpu_fft_execute() with &info->base and GPU_FFT_QPUS.
// NOTE(review): presumably launches the prepared job on num_qpus QPUs; the
// meaning of the unsigned return value is not visible here - confirm against
// the implementation.
unsigned gpu_fft_base_exec (
struct GPU_FFT_BASE *base,
int num_qpus);

// Called by gpu_fft_prepare() to obtain `size` bytes of working memory and
// fill in *ptr. A non-zero return is treated by the caller as an error code
// and propagated; on a zero return the caller uses ptr->arm.vptr as the start
// of the buffer.
int gpu_fft_alloc (
int mb,
unsigned size,
struct GPU_FFT_PTR *ptr);

// Called by gpu_fft_release() with &info->base.
// NOTE(review): presumably undoes gpu_fft_alloc() - confirm.
void gpu_fft_base_release(
struct GPU_FFT_BASE *base);

// Steps the cursor *ptr forward by `bytes`. gpu_fft_prepare() stores the
// return value as the vc address of the region it has just filled in.
// NOTE(review): presumably returns the vc value from before the step, like
// the static advance() helper it replaces in gpu_fft.c - confirm.
unsigned gpu_fft_ptr_inc (
struct GPU_FFT_PTR *ptr,
int bytes);

#endif // __GPU_FFT__
58 changes: 43 additions & 15 deletions hardfp/opt/vc/src/hello_pi/hello_fft/gpu_fft.txt
@@ -1,9 +1,10 @@
BCM2835 "GPU_FFT" by Andrew Holme, 2013.
BCM2835 "GPU_FFT" release 2.0 BETA by Andrew Holme, 2014.

GPU_FFT is an FFT library for the Raspberry Pi which exploits the BCM2835 SoC
3D hardware to deliver ten times more data throughput than is possible on the
700 MHz ARM. Kernels are provided for all power-of-2 FFT lengths between 256
and 131,072 points inclusive.
and 1,048,576 points inclusive. A transpose function, which also uses the 3D
hardware, is provided to support 2-dimensional transforms.


*** Accuracy ***
Expand All @@ -15,22 +16,29 @@ is not scaled. The relative root-mean-square (rms) error in parts-per-million
log2(N) | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17
ppm rms | 0.27 | 0.42 | 0.50 | 0.70 | 2.3 | 4.4 | 7.6 | 9.2 | 18 | 70

log2(N) | 18 | 19 | 20 | 8...17 batch of 10
ppm rms | 100 | 180 | 360 | 18...20 batch of 1


*** Throughput ***

GPU_FFT is invoked through a kernel ioctl call which adds 100us overhead. To
mitigate this, transform batches can be executed with a single call. Typical
per-transform runtime in microseconds for various batch sizes and comparative
figures for FFTW (FFTW_MEASURE mode) are as follows:
GPU_FFT 1.0 had to be invoked through a "mailbox" which added a 100us overhead
on every call. To mitigate this, batches of transforms could be submitted via
a single call. GPU_FFT 2.0 avoids this 100us overhead by poking GPU registers
directly from the ARM if total batch runtime will be short; but still uses the
mailbox for longer jobs to avoid busy waiting at 100% CPU for too long.

Typical per-transform runtimes for batch sizes of 1 and 10; and comparative
figures for FFTW (FFTW_MEASURE mode) are:

log2(N) | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 |
1 | 0.036 | 0.051 | 0.070 | 0.11 | 0.24 | 0.58 | 1.2 | 3.3 |
10 | 0.016 | 0.027 | 0.045 | 0.095 | 0.25 | 0.61 | 1.2 | 3.2 |
FFTW | 0.092 | 0.22 | 0.48 | 0.95 | 3.0 | 5.1 | 12 | 31 |

log2(N) | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17
1 | 112 | 125 | 136 | 180 | 298 | 689 | 1274 | 3397 | 6978 | 16734
2 | 56 | 74 | 85 | 133 | 285 | 663 | 1227 | 3362 | 6759 | 16179
5 | 31 | 75 | 61 | 113 | 274 | 631 | 1188 | 3228 | 6693 | 16180
10 | 22 | 37 | 54 | 107 | 256 | 624 | 1167 | 3225 | 6703 | 16110
20 | 19 | 31 | 52 | 101 | 252 | 615 | 1138 | 3202 | 6684 | 16181
50 | 16 | 26 | 45 | 93 | 240 | 608 | 1131 | 3196 | 6674 | 16171
FFTW | 92 | 217 | 482 | 952 | 3002 | 5082 | 12005 | 31211 | 82769 | 183731
log2(N) | 16 | 17 | 18 | 19 | 20 | All times in
1 | 6.8 | 16 | 42 | 95 | 190 | milliseconds
FFTW | 83 | 180 | 560 | 670 | 1600 | 2 sig. figs.


*** API functions ***
Expand All @@ -49,7 +57,7 @@ log2(N) | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17

int mb Mailbox file descriptor obtained by calling mbox_open()

int log2_N log2(FFT length) = 8 to 17
int log2_N log2(FFT length) = 8 to 20

int direction FFT direction: GPU_FFT_FWD for forward FFT
GPU_FFT_REV for inverse FFT
Expand Down Expand Up @@ -127,3 +135,23 @@ waiting for rendering, call glFlush() and glFinish() afterwards as follows:
gpu_fft_execute(....); // blocking call
....
}


*** 2-dimensional FFT ***

Please study the hello_fft_2d demo source, which is built and executed thus:

make hello_fft_2d.bin
sudo ./hello_fft_2d.bin

This generates a Windows BMP file: "hello_fft_2d.bmp"

The demo uses a square 512x512 array; however, rectangular arrays are allowed.
The following lines in gpu_fft_trans.c limit each dimension to the smaller of
the source and destination sizes, so the transpose stays within what is safe:

ptr.arm.uptr[6] = src->x < dst->y? src->x : dst->y;
ptr.arm.uptr[7] = src->y < dst->x? src->y : dst->x;

One may transpose the output from the second FFT pass back into the first pass
input buffer, by preparing and executing a second transposition; however, this
is probably unnecessary. It depends on how the final output will be accessed.

0 comments on commit e45f94d

Please sign in to comment.