Skip to content

Commit

Permalink
Added VFP support for ARM
Browse files: browse the repository at this point in the history
  • Loading branch information
anthonix committed Apr 3, 2013
1 parent 73a533c commit 17aaf9d
Show file tree
Hide file tree
Showing 9 changed files with 669 additions and 40 deletions.
2 changes: 1 addition & 1 deletion build_iphone.sh
Expand Up @@ -4,7 +4,7 @@

INSTALL_DIR="`pwd`/build"

export SDKVER="6.0"
export SDKVER="6.1"
export DEVROOT="/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer"
export SDKROOT="$DEVROOT/SDKs/iPhoneOS$SDKVER.sdk"
export CFLAGS="-O3 -Wreturn-type -Wparentheses -Wswitch -Wno-unused-parameter -Wno-unused-variable -Wunused-value -Wno-shorten-64-to-32 -Wno-trigraphs -fpascal-strings -miphoneos-version-min=5.0 -mcpu=cortex-a9 -arch armv7 -mfpu=neon -pipe -isysroot $SDKROOT -isystem $SDKROOT/usr/include -isystem $DEVROOT/usr/include -mno-thumb -no-integrated-as"
Expand Down
2 changes: 1 addition & 1 deletion src/Makefile.am
Expand Up @@ -18,7 +18,7 @@ if HAVE_NEON
if DYNAMIC_DISABLED
libffts_la_SOURCES += neon_static_f.s neon_static_i.s
else
libffts_la_SOURCES += neon.s
libffts_la_SOURCES += neon.s vfp.s
endif

else
Expand Down
10 changes: 5 additions & 5 deletions src/Makefile.in
Expand Up @@ -54,7 +54,7 @@ host_triplet = @host@
@DYNAMIC_DISABLED_TRUE@am__append_1 = ffts_static.c
@DYNAMIC_DISABLED_FALSE@am__append_2 = codegen.c
@DYNAMIC_DISABLED_TRUE@@HAVE_NEON_TRUE@am__append_3 = neon_static_f.s neon_static_i.s
@DYNAMIC_DISABLED_FALSE@@HAVE_NEON_TRUE@am__append_4 = neon.s
@DYNAMIC_DISABLED_FALSE@@HAVE_NEON_TRUE@am__append_4 = neon.s vfp.s
@HAVE_NEON_FALSE@@HAVE_SSE_TRUE@am__append_5 = sse.s
subdir = src
DIST_COMMON = $(libffts_include_HEADERS) $(srcdir)/Makefile.am \
Expand Down Expand Up @@ -100,13 +100,13 @@ LTLIBRARIES = $(lib_LTLIBRARIES)
libffts_la_LIBADD =
am__libffts_la_SOURCES_DIST = ffts.c ffts_nd.c ffts_real.c \
ffts_real_nd.c patterns.c ffts_static.c codegen.c \
neon_static_f.s neon_static_i.s neon.s sse.s
neon_static_f.s neon_static_i.s neon.s vfp.s sse.s
@DYNAMIC_DISABLED_TRUE@am__objects_1 = ffts_static.lo
@DYNAMIC_DISABLED_FALSE@am__objects_2 = codegen.lo
@DYNAMIC_DISABLED_TRUE@@HAVE_NEON_TRUE@am__objects_3 = \
@DYNAMIC_DISABLED_TRUE@@HAVE_NEON_TRUE@ neon_static_f.lo \
@DYNAMIC_DISABLED_TRUE@@HAVE_NEON_TRUE@ neon_static_i.lo
@DYNAMIC_DISABLED_FALSE@@HAVE_NEON_TRUE@am__objects_4 = neon.lo
@DYNAMIC_DISABLED_FALSE@@HAVE_NEON_TRUE@am__objects_4 = neon.lo vfp.lo
@HAVE_NEON_FALSE@@HAVE_SSE_TRUE@am__objects_5 = sse.lo
am_libffts_la_OBJECTS = ffts.lo ffts_nd.lo ffts_real.lo \
ffts_real_nd.lo patterns.lo $(am__objects_1) $(am__objects_2) \
Expand Down Expand Up @@ -279,9 +279,9 @@ $(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps)
exit 1;; \
esac; \
done; \
echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign src/Makefile'; \
echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu src/Makefile'; \
$(am__cd) $(top_srcdir) && \
$(AUTOMAKE) --foreign src/Makefile
$(AUTOMAKE) --gnu src/Makefile
.PRECIOUS: Makefile
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
@case '$?' in \
Expand Down
88 changes: 77 additions & 11 deletions src/codegen.c
Expand Up @@ -46,6 +46,7 @@
#include "codegen_neon.h"
// #include "neon_float.h"
#include "neon.h"
#include "vfp.h"
#else
#include "codegen_sse.h"
#include "sse_float.h"
Expand Down Expand Up @@ -201,6 +202,7 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
}

insns_t *x_8_addr = fp;
#ifdef __arm__
#ifdef __ARM_NEON__
memcpy(fp, neon_x8, neon_x8_t - neon_x8);
if(sign < 0) {
Expand All @@ -209,6 +211,10 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
fp[97] ^= 0x00200000; fp[98] ^= 0x00200000; fp[102] ^= 0x00200000; fp[104] ^= 0x00200000;
}
fp += (neon_x8_t - neon_x8) / 4;
#else
memcpy(fp, vfp_x8, vfp_end - vfp_x8);
fp += (vfp_end - vfp_x8) / 4;
#endif
#else
align_mem16(&fp, 0);
x_8_addr = fp;
Expand All @@ -221,12 +227,18 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
//memcpy(fp, neon_x8_t, neon_end - neon_x8_t);
//fp += (neon_end - neon_x8_t) / 4;
insns_t *x_4_addr = fp;
#ifdef __arm__

#ifdef __ARM_NEON__
memcpy(fp, neon_x4, neon_x8 - neon_x4);
if(sign < 0) {
fp[26] ^= 0x00200000; fp[28] ^= 0x00200000; fp[31] ^= 0x00200000; fp[32] ^= 0x00200000;
}
fp += (neon_x8 - neon_x4) / 4;
#else
memcpy(fp, vfp_x4, vfp_x8 - vfp_x4);
fp += (vfp_x8 - vfp_x4) / 4;
#endif
#else
align_mem16(&fp, 0);
x_4_addr = fp;
Expand Down Expand Up @@ -257,9 +269,14 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
#endif


#ifdef __ARM_NEON__
*fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++;
MOVI(&fp, 11, p->i0);
#ifdef __arm__
*fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++;
#ifdef __ARM_NEON__
MOVI(&fp, 11, p->i0);
#else
MOVI(&fp, 11, p->i0);
#endif

#else
align_mem16(&fp, 0);
start = fp;
Expand All @@ -273,14 +290,19 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
//LEA(&fp, R8, RDI, ((uint32_t)&p->offsets) - ((uint32_t)p));
#endif
//fp++;
#ifdef __arm__
#ifdef __ARM_NEON__
memcpy(fp, neon_ee, neon_oo - neon_ee);
if(sign < 0) {
fp[33] ^= 0x00200000; fp[37] ^= 0x00200000; fp[38] ^= 0x00200000; fp[39] ^= 0x00200000;
fp[40] ^= 0x00200000; fp[41] ^= 0x00200000; fp[44] ^= 0x00200000; fp[45] ^= 0x00200000;
fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000;
}
if(sign < 0) {
fp[33] ^= 0x00200000; fp[37] ^= 0x00200000; fp[38] ^= 0x00200000; fp[39] ^= 0x00200000;
fp[40] ^= 0x00200000; fp[41] ^= 0x00200000; fp[44] ^= 0x00200000; fp[45] ^= 0x00200000;
fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000;
}
fp += (neon_oo - neon_ee) / 4;
#else
memcpy(fp, vfp_e, vfp_o - vfp_e);
fp += (vfp_o - vfp_e) / 4;
#endif
#else
//fprintf(stderr, "Body start address = %016p\n", start);

Expand Down Expand Up @@ -403,14 +425,14 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {


if(pps[0] == 2*leafN) {
CALL(&fp, x_4_addr);
// CALL(&fp, x_4_addr);
// }else if(!pps[2]){
// //uint32_t *x_8_t_addr = fp;
// memcpy(fp, neon_x8_t, neon_ee - neon_x8_t);
// fp += (neon_ee - neon_x8_t) / 4;
// //*fp++ = BL(fp+2, x_8_t_addr);
}else{
CALL(&fp, x_8_addr);
// CALL(&fp, x_8_addr);
}

pAddr = pps[1] * 4;
Expand All @@ -422,6 +444,7 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
pps += 2;
}
#endif
#ifdef __arm__
#ifdef __ARM_NEON__
if(__builtin_ctzl(N) & 1){
ADDI(&fp, 2, 7, 0);
Expand Down Expand Up @@ -519,7 +542,45 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
fp += (neon_oo - neon_ee) / 4;

}
#else
ADDI(&fp, 2, 7, 0);
ADDI(&fp, 7, 9, 0);
ADDI(&fp, 9, 2, 0);

ADDI(&fp, 2, 8, 0);
ADDI(&fp, 8, 10, 0);
ADDI(&fp, 10, 2, 0);

MOVI(&fp, 11, (p->i1>0) ? p->i1 : 1);
memcpy(fp, vfp_o, vfp_x4 - vfp_o);
fp += (vfp_x4 - vfp_o) / 4;

ADDI(&fp, 2, 3, 0);
ADDI(&fp, 3, 7, 0);
ADDI(&fp, 7, 2, 0);

ADDI(&fp, 2, 4, 0);
ADDI(&fp, 4, 8, 0);
ADDI(&fp, 8, 2, 0);

ADDI(&fp, 2, 5, 0);
ADDI(&fp, 5, 9, 0);
ADDI(&fp, 9, 2, 0);

ADDI(&fp, 2, 6, 0);
ADDI(&fp, 6, 10, 0);
ADDI(&fp, 10, 2, 0);

ADDI(&fp, 2, 9, 0);
ADDI(&fp, 9, 10, 0);
ADDI(&fp, 10, 2, 0);

*fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++;
MOVI(&fp, 11, (p->i2>0) ? p->i2 : 1);
memcpy(fp, vfp_e, vfp_o - vfp_e);
fp += (vfp_o - vfp_e) / 4;

#endif
*fp = LDRI(2, 1, ((uint32_t)&p->ws) - ((uint32_t)p)); fp++; // load offsets into r12
//ADDI(&fp, 2, 1, 0);
MOVI(&fp, 1, 0);
Expand Down Expand Up @@ -551,6 +612,7 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
*fp = BL(fp+2, x_4_addr); fp++;
}else if(!pps[2]){
//uint32_t *x_8_t_addr = fp;
#ifdef __ARM_NEON__
memcpy(fp, neon_x8_t, neon_ee - neon_x8_t);
if(sign < 0) {
fp[31] ^= 0x00200000; fp[32] ^= 0x00200000; fp[33] ^= 0x00200000; fp[34] ^= 0x00200000;
Expand All @@ -559,6 +621,10 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
}
fp += (neon_ee - neon_x8_t) / 4;
//*fp++ = BL(fp+2, x_8_t_addr);

#else
*fp = BL(fp+2, x_8_addr); fp++;
#endif
}else{
*fp = BL(fp+2, x_8_addr); fp++;
}
Expand Down Expand Up @@ -612,7 +678,7 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
exit(1);
}
#ifdef __APPLE__
// sys_icache_invalidate(func, p->transform_size);
sys_icache_invalidate(func, p->transform_size);
#elif __ANDROID__
cacheflush((long)(func), (long)(func) + p->transform_size, 0);
#elif __linux__
Expand Down
74 changes: 54 additions & 20 deletions src/ffts.c
Expand Up @@ -104,13 +104,30 @@ ffts_plan_t *ffts_init_1d(size_t N, int sign) {

if(N >= 32) {
ffts_init_offsets(p, N, leafN);
ffts_init_is(p, N, leafN, 2);
#ifdef __arm__
#ifdef __ARM_NEON__
ffts_init_is(p, N, leafN, 1);
#else
ffts_init_is(p, N, leafN, 1);
#endif
#else
ffts_init_is(p, N, leafN, 1);
#endif

p->i0 = N/leafN/3+1;
p->i1 = N/leafN/3;
if((N/leafN) % 3 > 1) p->i1++;
p->i0/=2;
p->i1/=2;
p->i2 = N/leafN/3;

#ifdef __arm__
#ifdef __ARM_NEON__
p->i0/=2;
p->i1/=2;
#endif
#else
p->i0/=2;
p->i1/=2;
#endif

}else{
p->transforms = malloc(2 * sizeof(transform_index_t));
Expand Down Expand Up @@ -198,7 +215,7 @@ ffts_plan_t *ffts_init_1d(size_t N, int sign) {


float *fw0 = (float *)w0;
#ifdef __ARM_NEON__
#ifdef __arm__
if(N < 32) {
//w = FFTS_MALLOC(n/4 * 2 * sizeof(cdata_t), 32);
float *fw = (float *)w;
Expand All @@ -217,11 +234,18 @@ ffts_plan_t *ffts_init_1d(size_t N, int sign) {
//w = FFTS_MALLOC(n/4 * sizeof(cdata_t), 32);
float *fw = (float *)w;
VS temp0, temp1, temp2;
#ifdef __ARM_NEON__
for(j=0;j<n/4;j+=4) {
temp0 = VLD2(fw0 + j*2);
temp0.val[1] = VXOR(temp0.val[1], neg);
STORESPR(fw + j*2, temp0);
temp0 = VLD2(fw0 + j*2);
temp0.val[1] = VXOR(temp0.val[1], neg);
STORESPR(fw + j*2, temp0);
}
#else
for(j=0;j<n/4;j+=1) {
fw[j*2] = fw0[j*2];
fw[j*2+1] = fw0[j*2+1];
}
#endif
w += n/4;
}
#else
Expand Down Expand Up @@ -261,22 +285,32 @@ ffts_plan_t *ffts_init_1d(size_t N, int sign) {
float *fw0 = (float *)w0;
float *fw1 = (float *)w1;
float *fw2 = (float *)w2;
#ifdef __ARM_NEON__
#ifdef __arm__
//w = FFTS_MALLOC(n/8 * 3 * sizeof(cdata_t), 32);
float *fw = (float *)w;
VS temp0, temp1, temp2;

for(j=0;j<n/8;j+=4) {
temp0 = VLD2(fw0 + j*2);
temp0.val[1] = VXOR(temp0.val[1], neg);
STORESPR(fw + j*2*3, temp0);
temp1 = VLD2(fw1 + j*2);
temp1.val[1] = VXOR(temp1.val[1], neg);
STORESPR(fw + j*2*3 + 8, temp1);
temp2 = VLD2(fw2 + j*2);
temp2.val[1] = VXOR(temp2.val[1], neg);
STORESPR(fw + j*2*3 + 16, temp2);
}
#ifdef __ARM_NEON__
for(j=0;j<n/8;j+=4) {
temp0 = VLD2(fw0 + j*2);
temp0.val[1] = VXOR(temp0.val[1], neg);
STORESPR(fw + j*2*3, temp0);
temp1 = VLD2(fw1 + j*2);
temp1.val[1] = VXOR(temp1.val[1], neg);
STORESPR(fw + j*2*3 + 8, temp1);
temp2 = VLD2(fw2 + j*2);
temp2.val[1] = VXOR(temp2.val[1], neg);
STORESPR(fw + j*2*3 + 16, temp2);
}
#else
for(j=0;j<n/8;j+=1) {
fw[j*6] = fw0[j*2];
fw[j*6+1] = fw0[j*2+1];
fw[j*6+2] = fw1[j*2+0];
fw[j*6+3] = fw1[j*2+1];
fw[j*6+4] = fw2[j*2+0];
fw[j*6+5] = fw2[j*2+1];
}
#endif
w += n/8 * 3;
#else
//w = FFTS_MALLOC(n/8 * 3 * 2 * sizeof(cdata_t), 32);
Expand Down
1 change: 1 addition & 0 deletions src/ffts.h
Expand Up @@ -94,6 +94,7 @@ struct _ffts_plan_t {
void (*destroy)(ffts_plan_t *);

float *A, *B;
size_t i2;
};


Expand Down

0 comments on commit 17aaf9d

Please sign in to comment.