Added VFP support for ARM

sevagh · Apr 3, 2013 · 17aaf9d
1 parent 73a533c
commit 17aaf9d
Show file tree

Hide file tree

Showing 9 changed files with 669 additions and 40 deletions.
diff --git a/build_iphone.sh b/build_iphone.sh
@@ -4,7 +4,7 @@
 
 INSTALL_DIR="`pwd`/build"
 
-export SDKVER="6.0"
+export SDKVER="6.1"
 export DEVROOT="/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer"
 export SDKROOT="$DEVROOT/SDKs/iPhoneOS$SDKVER.sdk"
 export CFLAGS="-O3 -Wreturn-type -Wparentheses -Wswitch -Wno-unused-parameter -Wno-unused-variable -Wunused-value -Wno-shorten-64-to-32 -Wno-trigraphs -fpascal-strings -miphoneos-version-min=5.0 -mcpu=cortex-a9 -arch armv7 -mfpu=neon -pipe -isysroot $SDKROOT -isystem $SDKROOT/usr/include -isystem $DEVROOT/usr/include -mno-thumb -no-integrated-as"

diff --git a/src/Makefile.am b/src/Makefile.am
@@ -18,7 +18,7 @@ if HAVE_NEON
 if DYNAMIC_DISABLED
 libffts_la_SOURCES += neon_static_f.s neon_static_i.s
 else
-libffts_la_SOURCES += neon.s 
+libffts_la_SOURCES += neon.s vfp.s 
 endif
 
 else 

diff --git a/src/Makefile.in b/src/Makefile.in
@@ -54,7 +54,7 @@ host_triplet = @host@
 @DYNAMIC_DISABLED_TRUE@am__append_1 = ffts_static.c
 @DYNAMIC_DISABLED_FALSE@am__append_2 = codegen.c
 @DYNAMIC_DISABLED_TRUE@@HAVE_NEON_TRUE@am__append_3 = neon_static_f.s neon_static_i.s
-@DYNAMIC_DISABLED_FALSE@@HAVE_NEON_TRUE@am__append_4 = neon.s 
+@DYNAMIC_DISABLED_FALSE@@HAVE_NEON_TRUE@am__append_4 = neon.s vfp.s 
 @HAVE_NEON_FALSE@@HAVE_SSE_TRUE@am__append_5 = sse.s
 subdir = src
 DIST_COMMON = $(libffts_include_HEADERS) $(srcdir)/Makefile.am \
@@ -100,13 +100,13 @@ LTLIBRARIES = $(lib_LTLIBRARIES)
 libffts_la_LIBADD =
 am__libffts_la_SOURCES_DIST = ffts.c ffts_nd.c ffts_real.c \
 	ffts_real_nd.c patterns.c ffts_static.c codegen.c \
-	neon_static_f.s neon_static_i.s neon.s sse.s
+	neon_static_f.s neon_static_i.s neon.s vfp.s sse.s
 @DYNAMIC_DISABLED_TRUE@am__objects_1 = ffts_static.lo
 @DYNAMIC_DISABLED_FALSE@am__objects_2 = codegen.lo
 @DYNAMIC_DISABLED_TRUE@@HAVE_NEON_TRUE@am__objects_3 =  \
 @DYNAMIC_DISABLED_TRUE@@HAVE_NEON_TRUE@	neon_static_f.lo \
 @DYNAMIC_DISABLED_TRUE@@HAVE_NEON_TRUE@	neon_static_i.lo
-@DYNAMIC_DISABLED_FALSE@@HAVE_NEON_TRUE@am__objects_4 = neon.lo
+@DYNAMIC_DISABLED_FALSE@@HAVE_NEON_TRUE@am__objects_4 = neon.lo vfp.lo
 @HAVE_NEON_FALSE@@HAVE_SSE_TRUE@am__objects_5 = sse.lo
 am_libffts_la_OBJECTS = ffts.lo ffts_nd.lo ffts_real.lo \
 	ffts_real_nd.lo patterns.lo $(am__objects_1) $(am__objects_2) \
@@ -279,9 +279,9 @@ $(srcdir)/Makefile.in:  $(srcdir)/Makefile.am  $(am__configure_deps)
 	      exit 1;; \
 	  esac; \
 	done; \
-	echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign src/Makefile'; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu src/Makefile'; \
 	$(am__cd) $(top_srcdir) && \
-	  $(AUTOMAKE) --foreign src/Makefile
+	  $(AUTOMAKE) --gnu src/Makefile
 .PRECIOUS: Makefile
 Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
 	@case '$?' in \

diff --git a/src/codegen.c b/src/codegen.c
@@ -46,6 +46,7 @@
 	#include "codegen_neon.h"
 //	#include "neon_float.h"
 	#include "neon.h"
+	#include "vfp.h"
 #else
 	#include "codegen_sse.h"
 	#include "sse_float.h"
@@ -201,6 +202,7 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
 	}
 
 	insns_t *x_8_addr = fp;
+#ifdef __arm__
 #ifdef __ARM_NEON__
 	memcpy(fp, neon_x8, neon_x8_t - neon_x8);
 	if(sign < 0) {
@@ -209,6 +211,10 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
 		fp[97] ^= 0x00200000; fp[98] ^= 0x00200000; fp[102] ^= 0x00200000; fp[104] ^= 0x00200000;
 	}
 	fp += (neon_x8_t - neon_x8) / 4;
+#else
+	memcpy(fp, vfp_x8, vfp_end - vfp_x8);
+	fp += (vfp_end - vfp_x8) / 4;
+#endif
 #else
 	align_mem16(&fp, 0);
 	x_8_addr = fp;
@@ -221,12 +227,18 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
 //memcpy(fp, neon_x8_t, neon_end - neon_x8_t);
 //fp += (neon_end - neon_x8_t) / 4;
 	insns_t *x_4_addr = fp;
+#ifdef __arm__
+
 #ifdef __ARM_NEON__
 	memcpy(fp, neon_x4, neon_x8 - neon_x4);
 	if(sign < 0) {
 		fp[26] ^= 0x00200000; fp[28] ^= 0x00200000; fp[31] ^= 0x00200000; fp[32] ^= 0x00200000;
 	}
 	fp += (neon_x8 - neon_x4) / 4;
+#else
+	memcpy(fp, vfp_x4, vfp_x8 - vfp_x4);
+	fp += (vfp_x8 - vfp_x4) / 4;
+#endif
 #else
 	align_mem16(&fp, 0);
 	x_4_addr = fp;
@@ -257,9 +269,14 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
 #endif
 
 
-#ifdef __ARM_NEON__
-	*fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++; 
-	MOVI(&fp, 11, p->i0);
+#ifdef __arm__
+		*fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++; 
+  	#ifdef __ARM_NEON__
+  		MOVI(&fp, 11, p->i0);
+  	#else 
+  		MOVI(&fp, 11, p->i0);
+  	#endif
+
 #else
 	align_mem16(&fp, 0);
 	start = fp;
@@ -273,14 +290,19 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
 	//LEA(&fp, R8, RDI, ((uint32_t)&p->offsets) - ((uint32_t)p)); 
 #endif
 	//fp++;
+#ifdef __arm__
 #ifdef __ARM_NEON__
 	memcpy(fp, neon_ee, neon_oo - neon_ee);
-  if(sign < 0) {
-  	fp[33] ^= 0x00200000; fp[37] ^= 0x00200000; fp[38] ^= 0x00200000; fp[39] ^= 0x00200000;
-  	fp[40] ^= 0x00200000; fp[41] ^= 0x00200000; fp[44] ^= 0x00200000; fp[45] ^= 0x00200000;
-  	fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000;
-  }
+	if(sign < 0) {
+		fp[33] ^= 0x00200000; fp[37] ^= 0x00200000; fp[38] ^= 0x00200000; fp[39] ^= 0x00200000;
+		fp[40] ^= 0x00200000; fp[41] ^= 0x00200000; fp[44] ^= 0x00200000; fp[45] ^= 0x00200000;
+		fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000;
+	}
 	fp += (neon_oo - neon_ee) / 4;
+#else
+		memcpy(fp, vfp_e, vfp_o - vfp_e);
+		fp += (vfp_o - vfp_e) / 4;
+#endif
 #else
 //fprintf(stderr, "Body start address = %016p\n", start);
 
@@ -403,14 +425,14 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
 
 
   	if(pps[0] == 2*leafN) {
-      CALL(&fp, x_4_addr);
+   //   CALL(&fp, x_4_addr);
 //  	}else if(!pps[2]){
 //	  //uint32_t *x_8_t_addr = fp;
 //		memcpy(fp, neon_x8_t, neon_ee - neon_x8_t);
 //		fp += (neon_ee - neon_x8_t) / 4;
 //		//*fp++ = BL(fp+2, x_8_t_addr);
   	}else{
-    		CALL(&fp, x_8_addr);
+    //		CALL(&fp, x_8_addr);
   	}
 
 		pAddr = pps[1] * 4;
@@ -422,6 +444,7 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
 		pps += 2;
 	}
 #endif
+#ifdef __arm__
 #ifdef __ARM_NEON__
 	if(__builtin_ctzl(N) & 1){
 		ADDI(&fp, 2, 7, 0);
@@ -519,7 +542,45 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
 		fp += (neon_oo - neon_ee) / 4;
 
 	}
+#else
+		ADDI(&fp, 2, 7, 0);
+		ADDI(&fp, 7, 9, 0);
+		ADDI(&fp, 9, 2, 0);
+
+		ADDI(&fp, 2, 8, 0);
+		ADDI(&fp, 8, 10, 0);
+		ADDI(&fp, 10, 2, 0);
 
+			MOVI(&fp, 11, (p->i1>0) ? p->i1 : 1);
+  		memcpy(fp, vfp_o, vfp_x4 - vfp_o);
+  		fp += (vfp_x4 - vfp_o) / 4;
+
+		ADDI(&fp, 2, 3, 0);
+		ADDI(&fp, 3, 7, 0);
+		ADDI(&fp, 7, 2, 0);
+
+		ADDI(&fp, 2, 4, 0);
+		ADDI(&fp, 4, 8, 0);
+		ADDI(&fp, 8, 2, 0);
+
+		ADDI(&fp, 2, 5, 0);
+		ADDI(&fp, 5, 9, 0);
+		ADDI(&fp, 9, 2, 0);
+
+		ADDI(&fp, 2, 6, 0);
+		ADDI(&fp, 6, 10, 0);
+		ADDI(&fp, 10, 2, 0);
+
+		ADDI(&fp, 2, 9, 0);
+		ADDI(&fp, 9, 10, 0);
+		ADDI(&fp, 10, 2, 0);
+
+		*fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++; 
+	  MOVI(&fp, 11, (p->i2>0) ? p->i2 : 1);
+  	memcpy(fp, vfp_e, vfp_o - vfp_e);
+  	fp += (vfp_o - vfp_e) / 4;
+
+#endif
   *fp = LDRI(2, 1, ((uint32_t)&p->ws) - ((uint32_t)p)); fp++; // load offsets into r12
 	//ADDI(&fp, 2, 1, 0);
 	MOVI(&fp, 1, 0);
@@ -551,6 +612,7 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
       *fp = BL(fp+2, x_4_addr); fp++;
   	}else if(!pps[2]){
   	  //uint32_t *x_8_t_addr = fp;
+#ifdef __ARM_NEON__
   		memcpy(fp, neon_x8_t, neon_ee - neon_x8_t);
   		if(sign < 0) {
   			fp[31] ^= 0x00200000; fp[32] ^= 0x00200000; fp[33] ^= 0x00200000; fp[34] ^= 0x00200000;
@@ -559,6 +621,10 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
   		}
   		fp += (neon_ee - neon_x8_t) / 4;
   		//*fp++ = BL(fp+2, x_8_t_addr);
+
+#else
+  		*fp = BL(fp+2, x_8_addr); fp++;
+#endif
   	}else{
   		*fp = BL(fp+2, x_8_addr); fp++;
   	}
@@ -612,7 +678,7 @@ void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
   	exit(1);
   }
 #ifdef __APPLE__
-//	sys_icache_invalidate(func, p->transform_size);
+	sys_icache_invalidate(func, p->transform_size);
 #elif __ANDROID__
 	cacheflush((long)(func), (long)(func) + p->transform_size, 0);
 #elif __linux__

diff --git a/src/ffts.c b/src/ffts.c
@@ -104,13 +104,30 @@ ffts_plan_t *ffts_init_1d(size_t N, int sign) {
 
 	if(N >= 32) {
 		ffts_init_offsets(p, N, leafN);
-		ffts_init_is(p, N, leafN, 2);
+#ifdef __arm__
+#ifdef __ARM_NEON__
+		ffts_init_is(p, N, leafN, 1);
+#else
+		ffts_init_is(p, N, leafN, 1);
+#endif
+#else
+		ffts_init_is(p, N, leafN, 1);
+#endif
 
 		p->i0 = N/leafN/3+1;
 		p->i1 = N/leafN/3;
 		if((N/leafN) % 3 > 1) p->i1++;
-		p->i0/=2;
-		p->i1/=2;
+		p->i2 = N/leafN/3;
+
+	#ifdef __arm__	
+	#ifdef __ARM_NEON__
+	p->i0/=2;
+	p->i1/=2;
+	#endif
+	#else
+	p->i0/=2;
+	p->i1/=2;
+	#endif
 
 	}else{
 		p->transforms = malloc(2 * sizeof(transform_index_t));
@@ -198,7 +215,7 @@ ffts_plan_t *ffts_init_1d(size_t N, int sign) {
 
 
 				float *fw0 = (float *)w0;
-				#ifdef __ARM_NEON__
+				#ifdef __arm__
 				if(N < 32) {
 					//w = FFTS_MALLOC(n/4 * 2 * sizeof(cdata_t), 32);
 					float *fw = (float *)w;
@@ -217,11 +234,18 @@ ffts_plan_t *ffts_init_1d(size_t N, int sign) {
 					//w = FFTS_MALLOC(n/4 * sizeof(cdata_t), 32);
 					float *fw = (float *)w;
 					VS temp0, temp1, temp2;
+  				#ifdef __ARM_NEON__
 					for(j=0;j<n/4;j+=4) {
-						temp0 = VLD2(fw0 + j*2);
-						temp0.val[1] = VXOR(temp0.val[1], neg);
-						STORESPR(fw + j*2, temp0);
+  					temp0 = VLD2(fw0 + j*2);
+  					temp0.val[1] = VXOR(temp0.val[1], neg);
+  					STORESPR(fw + j*2, temp0);
+  				}
+					#else
+  				for(j=0;j<n/4;j+=1) {
+  					fw[j*2] = fw0[j*2];
+  					fw[j*2+1] = fw0[j*2+1];
 					}
+					#endif
 					w += n/4;
 				}
 				#else
@@ -261,22 +285,32 @@ ffts_plan_t *ffts_init_1d(size_t N, int sign) {
 				float *fw0 = (float *)w0;
 				float *fw1 = (float *)w1;
 				float *fw2 = (float *)w2;
-				#ifdef __ARM_NEON__
+				#ifdef __arm__
 				//w = FFTS_MALLOC(n/8 * 3 * sizeof(cdata_t), 32);
 				float *fw = (float *)w;
 				VS temp0, temp1, temp2;
-
-				for(j=0;j<n/8;j+=4) {
-					temp0 = VLD2(fw0 + j*2);
-					temp0.val[1] = VXOR(temp0.val[1], neg);
-					STORESPR(fw + j*2*3,      temp0);
-					temp1 = VLD2(fw1 + j*2);
-					temp1.val[1] = VXOR(temp1.val[1], neg);
-					STORESPR(fw + j*2*3 + 8,  temp1);
-					temp2 = VLD2(fw2 + j*2);
-					temp2.val[1] = VXOR(temp2.val[1], neg);
-					STORESPR(fw + j*2*3 + 16, temp2);
-				}
+				#ifdef __ARM_NEON__	
+  			for(j=0;j<n/8;j+=4) {
+  				temp0 = VLD2(fw0 + j*2);
+  				temp0.val[1] = VXOR(temp0.val[1], neg);
+  				STORESPR(fw + j*2*3,      temp0);
+  				temp1 = VLD2(fw1 + j*2);
+  				temp1.val[1] = VXOR(temp1.val[1], neg);
+  				STORESPR(fw + j*2*3 + 8,  temp1);
+  				temp2 = VLD2(fw2 + j*2);
+  				temp2.val[1] = VXOR(temp2.val[1], neg);
+  				STORESPR(fw + j*2*3 + 16, temp2);
+  			}
+				#else
+  			for(j=0;j<n/8;j+=1) {
+  					fw[j*6] = fw0[j*2];
+  					fw[j*6+1] = fw0[j*2+1];
+  					fw[j*6+2] = fw1[j*2+0];
+  					fw[j*6+3] = fw1[j*2+1];
+  					fw[j*6+4] = fw2[j*2+0];
+  					fw[j*6+5] = fw2[j*2+1];
+  			}
+				#endif
 				w += n/8 * 3;
 				#else
 				//w = FFTS_MALLOC(n/8 * 3 * 2 * sizeof(cdata_t), 32);

diff --git a/src/ffts.h b/src/ffts.h
@@ -94,6 +94,7 @@ struct _ffts_plan_t {
 	void (*destroy)(ffts_plan_t *);
 
 	float *A, *B;
+	size_t i2;
 };