compiler: added PTX compilers using LLVM and Open64(nvopencc)

shinpei0208 · Nov 7, 2014 · 4b91fcf · 4b91fcf
1 parent e639547
commit 4b91fcf
Show file tree

Hide file tree

Showing 65 changed files with 599 additions and 0 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -14,6 +14,8 @@ OPTION(user "user mode(default off)" OFF)
 OPTION(runtime "enable CUDA runtime API(default on)" ON)
 OPTION(usched "enable user mode scheduler(default off)" OFF)
 OPTION(use_as "use assembler(default off)" OFF)
+OPTION(use_llvm "use LLVM compiler(default off)" OFF)
+OPTION(use_open64 "use Open64(nvopencc) compiler(default off)" OFF)
 
 MACRO(INCLUDE_DIRECTORY_IF_EXISTS DIR)
     IF(EXISTS "${DIR}/")
@@ -53,3 +55,11 @@ ENDIF(NOT user)
 IF(use_as)
     ADD_SUBDIRECTORY(compiler/as)
 ENDIF(use_as)
+
+IF(use_llvm)
+    ADD_SUBDIRECTORY(compiler/llvm)
+ENDIF(use_llvm)
+
+IF(use_open64)
+    ADD_SUBDIRECTORY(compiler/open64)
+ENDIF(use_open64)
diff --git a/compiler/llvm/CMakeLists.txt b/compiler/llvm/CMakeLists.txt
@@ -0,0 +1,42 @@
+EXECUTE_PROCESS(
+	COMMAND ${CMAKE_C_COMPILER} -dumpversion
+	OUTPUT_VARIABLE GCC_VERSION
+)
+if (GCC_VERSION VERSION_GREATER 4.7 OR GCC_VERSION VERSION_EQUAL 4.7)
+EXECUTE_PROCESS(
+	COMMAND svn co http://llvm.org/svn/llvm-project/llvm/trunk llvm
+	WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+)
+EXECUTE_PROCESS(
+	COMMAND svn co http://llvm.org/svn/llvm-project/cfe/trunk clang
+	WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/llvm/tools
+)
+EXECUTE_PROCESS(
+	COMMAND svn co http://llvm.org/svn/llvm-project/compiler-rt/trunk compiler-rt
+	WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/llvm/projects
+)
+EXECUTE_PROCESS(
+	COMMAND mkdir build
+	WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+)
+EXECUTE_PROCESS(
+	COMMAND cmake ${CMAKE_CURRENT_BINARY_DIR}/llvm
+	WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/build
+)
+ADD_CUSTOM_TARGET(llvm ALL make
+	WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/build
+)
+ADD_CUSTOM_TARGET(clean make clean
+	WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/build
+)
+INSTALL(
+	CODE "
+	EXECUTE_PROCESS(
+		COMMAND make install
+		WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/build
+	)
+	"
+)
+else()
+MESSAGE("skip: LLVM is required GCC version 4.7 or later.")
+endif()
diff --git a/compiler/open64/CMakeLists.txt b/compiler/open64/CMakeLists.txt
@@ -0,0 +1,44 @@
+FIND_PACKAGE(FLEX)
+FIND_PACKAGE(BISON)
+FIND_FILE(PERL4_CORELIBS getopts.pl PATHS /usr/lib/perl5 /usr/share/perl5)
+FIND_PROGRAM(TCSH tcsh PATHS /bin)
+EXECUTE_PROCESS(
+	COMMAND ${CMAKE_C_COMPILER} -dumpversion
+	OUTPUT_VARIABLE GCC_VERSION
+)
+if (GCC_VERSION VERSION_LESS 4.7)
+if(EXISTS ${PERL4_CORELIBS})
+if(EXISTS ${TCSH})
+EXECUTE_PROCESS(
+	COMMAND tar xzf ${CMAKE_CURRENT_SOURCE_DIR}/nvopencc_5.0_src_13604779.tar.gz
+	WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+)
+EXECUTE_PROCESS(
+	COMMAND cat ${CMAKE_CURRENT_SOURCE_DIR}/nvopencc_5.0.patch
+	COMMAND patch -p0 --read-only=ignore
+	WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+)
+ADD_CUSTOM_TARGET(open64 ALL make
+	WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/src/targia3264_nvisa
+)
+ADD_CUSTOM_TARGET(clean make clean
+	WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/src/targia3264_nvisa
+)
+INSTALL(
+	PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/src/targia3264_nvisa/bin/nvopencc
+	DESTINATION gdev/open64/bin
+)
+FILE(GLOB libs "${CMAKE_CURRENT_BINARY_DIR}/src/targia3264_nvisa/lib/*")
+INSTALL(
+	PROGRAMS ${libs}
+	DESTINATION gdev/open64/lib
+)
+else()
+MESSAGE("skip: Open64 is required tcsh.")
+endif()
+else()
+MESSAGE("skip: Open64 is required perl4-corelibs package.")
+endif()
+else()
+MESSAGE("skip: Open64 is required GCC version 4.6 or older.")
+endif()
diff --git a/compiler/open64/nvopencc_5.0.patch b/compiler/open64/nvopencc_5.0.patch
@@ -0,0 +1,33 @@
+--- ./src/Makefile.gsetup.orig	2012-04-24 02:04:57.000000000 +0900
++++ ./src/Makefile.gsetup	2014-10-27 04:16:31.717636564 +0900
+@@ -531,7 +531,7 @@ ERROR_ON_WARNINGS = #-Werror
+ ifeq ($(BUILD_TARGET), NVISA)
+ # This is due to unsupported aliases in Mach-O
+ ifneq ($(BUILD_OS), DARWIN)
+-ERROR_ON_WARNINGS = -Werror
++ERROR_ON_WARNINGS = #-Werror
+ endif
+ endif
+
+--- ./src/gccfe/gnu/c-parse.y.orig	2012-03-16 04:52:30.000000000 +0900
++++ ./src/gccfe/gnu/c-parse.y	2014-10-27 04:50:16.125664019 +0900
+@@ -66,6 +66,7 @@ Software Foundation, 59 Temple Place - S
+ #include <locale.h>
+ #endif
+
++#include "y.tab.c"
+
+ /* Like YYERROR but do call yyerror.  */
+ #define YYERROR1 { yyerror ("syntax error"); YYERROR; }
+--- ./src/linux/make/gcommondefs.orig	2012-03-16 04:53:45.000000000 +0900
++++ ./src/linux/make/gcommondefs	2014-10-27 04:06:48.885628660 +0900
+@@ -135,7 +135,8 @@ ifeq ($(BUILD_COMPILER), GNU)
+     F90 +=
+   endif
+   ifeq ($(BUILD_TARGET), NVISA)
+-    ROOT_DIR := $(dir $(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST)))/../../../..
++#    ROOT_DIR := $(dir $(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST)))/../../../..
++    ROOT_DIR := $(dir $(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST)))/../../..
+     ifneq ($(BUILD_OS), LINUX)
+       ifneq ($(BUILD_OS), DARWIN)
+         ifndef USE_NATIVE
diff --git a/compiler/open64/nvopencc_5.0_src_13604779.tar.gz b/compiler/open64/nvopencc_5.0_src_13604779.tar.gz
diff --git a/test/cuda/common/clang/cuda.h b/test/cuda/common/clang/cuda.h
@@ -0,0 +1,20 @@
+/* Minimal declarations for CUDA support.  Testing purposes only. */
+
+#include <stddef.h>
+
+#define __constant__ __attribute__((constant))
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+#define __host__ __attribute__((host))
+#define __shared__ __attribute__((shared))
+#define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__)))
+
+struct dim3 {
+  unsigned x, y, z;
+  __host__ __device__ dim3(unsigned x, unsigned y = 1, unsigned z = 1) : x(x), y(y), z(z) {}
+};
+
+typedef struct cudaStream *cudaStream_t;
+
+int cudaConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0,
+                      cudaStream_t stream = 0);
diff --git a/test/cuda/common/clang/float_gpu.cu b/test/cuda/common/clang/float_gpu.cu
@@ -0,0 +1,8 @@
+#include <stdint.h>
+#include "clang/cuda.h"
+extern "C"
+__global__
+void add(float a, float b, float *c)
+{
+	*c = a + b;
+}
diff --git a/test/cuda/common/clang/fmadd_gpu.cu b/test/cuda/common/clang/fmadd_gpu.cu
@@ -0,0 +1,14 @@
+#include <stdint.h>
+#include "clang/cuda.h"
+__global__
+void add(float *a, float *b, float *c, int n)
+{
+    int i = __builtin_ptx_read_ctaid_x() * __builtin_ptx_read_ntid_x()
+          + __builtin_ptx_read_tid_x();
+    int j = __builtin_ptx_read_ctaid_y() * __builtin_ptx_read_ntid_y()
+          + __builtin_ptx_read_tid_y();
+    if (i < n && j < n) {
+        int idx = i * n + j;
+        c[idx] = a[idx] + b[idx];
+    }
+}
diff --git a/test/cuda/common/clang/fmmul_gpu.cu b/test/cuda/common/clang/fmmul_gpu.cu
@@ -0,0 +1,14 @@
+#include <stdint.h>
+#include "clang/cuda.h"
+__global__
+void mul(float *a, float *b, float *c, int n)
+{
+    int i = __builtin_ptx_read_ctaid_x() * __builtin_ptx_read_ntid_x()
+          + __builtin_ptx_read_tid_x();
+    int j = __builtin_ptx_read_ctaid_y() * __builtin_ptx_read_ntid_y()
+          + __builtin_ptx_read_tid_y();
+    if (i < n && j < n) {
+        int idx = i * n + j;
+        c[idx] = a[idx] * b[idx];
+    }
+}
diff --git a/test/cuda/common/clang/idle_gpu.cu b/test/cuda/common/clang/idle_gpu.cu
@@ -0,0 +1,26 @@
+#include <stdint.h>
+#include "clang/cuda.h"
+
+extern "C"
+__global__
+void idle(unsigned int *p, unsigned int n)
+{
+    int x = __builtin_ptx_read_ctaid_x() * __builtin_ptx_read_ntid_x()
+          + __builtin_ptx_read_tid_x();
+    int y = __builtin_ptx_read_ctaid_y() * __builtin_ptx_read_ntid_y()
+          + __builtin_ptx_read_tid_y();
+	unsigned int i = 0, j = 0, k = 0;
+	__shared__ int s;
+
+	s = *p;
+	if (x == 0 && y == 0) {
+		for (i = 0; i < n; i++) {
+			if (x + y > n) {
+				s = s + x;
+				if (s > x + y)
+					s = x;
+			}
+		}
+	}
+	*p = s;
+}
diff --git a/test/cuda/common/clang/loop_gpu.cu b/test/cuda/common/clang/loop_gpu.cu
@@ -0,0 +1,13 @@
+#include <stdint.h>
+#include "clang/cuda.h"
+
+__global__
+void loop(uint32_t *data, uint32_t size, uint32_t n)
+{
+    int i;
+//	for (i = 0; i < n/40; i++) {
+	for (i = 0; i < n/5; i++) {
+		if (i * 4 < size)
+			data[i] = i + n;
+    }
+}
diff --git a/test/cuda/common/clang/madd_gpu.cu b/test/cuda/common/clang/madd_gpu.cu
@@ -0,0 +1,14 @@
+#include <stdint.h>
+#include "clang/cuda.h"
+__global__
+void add(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t n)
+{
+    int i = __builtin_ptx_read_ctaid_x() * __builtin_ptx_read_ntid_x()
+          + __builtin_ptx_read_tid_x();
+    int j = __builtin_ptx_read_ctaid_y() * __builtin_ptx_read_ntid_y()
+          + __builtin_ptx_read_tid_y();
+    if (i < n && j < n) {
+        int idx = i * n + j;
+        c[idx] = a[idx] + b[idx];
+    }
+}
diff --git a/test/cuda/common/clang/mmul_gpu.cu b/test/cuda/common/clang/mmul_gpu.cu
@@ -0,0 +1,21 @@
+#include "clang/cuda.h"
+
+extern "C" __global__ void multiply(unsigned int *a, unsigned int *b, unsigned int *c,
+ int n)
+{
+	unsigned int i;
+	unsigned int product = 0;
+
+	int row = __builtin_ptx_read_ctaid_y() * __builtin_ptx_read_ntid_y()
+	        + __builtin_ptx_read_tid_y();
+	int col = __builtin_ptx_read_ctaid_x() * __builtin_ptx_read_ntid_x()
+	        + __builtin_ptx_read_tid_x();
+
+	if(row < n && col < n){
+	    for (i = 0; i < n; i++)
+		product += a[row * n + i] * b[i * n + col];
+
+	    c[row*n + col] = product;
+	}
+}
+
diff --git a/test/cuda/user/float/Makefile.llvm b/test/cuda/user/float/Makefile.llvm
@@ -0,0 +1,18 @@
+# Makefile
+TARGET	= user_test
+ARCH	= sm_20
+CC	= gcc
+CLANG	= clang -cc1 -nostdsysteminc -triple nvptx-unknown-unknown -fcuda-is-device -emit-llvm
+LLC     = llc -march=nvptx64 -mcpu=$(ARCH)
+PTXAS   = ptxas -arch $(ARCH)
+LIBS	= -lucuda -lgdev
+CFLAGS	= -L /usr/local/gdev/lib64 -I /usr/local/gdev/include
+
+all:
+	$(CLANG) -I . -o float_gpu.ll clang/float_gpu.cu
+	$(LLC) -o float_gpu.ptx float_gpu.ll
+	$(PTXAS) -o float_gpu.cubin float_gpu.ptx
+	$(CC) -o $(TARGET) $(CFLAGS) main.c float.c $(LIBS)
+
+clean:
+	rm -f $(TARGET) *.cubin *.ptx *.ll ./*~
diff --git a/test/cuda/user/float/clang/cuda.h b/test/cuda/user/float/clang/cuda.h
@@ -0,0 +1 @@
+../../../common/clang/cuda.h
diff --git a/test/cuda/user/float/clang/float_gpu.cu b/test/cuda/user/float/clang/float_gpu.cu
@@ -0,0 +1 @@
+../../../common/clang/float_gpu.cu
diff --git a/test/cuda/user/fmadd/Makefile.llvm b/test/cuda/user/fmadd/Makefile.llvm
@@ -0,0 +1,18 @@
+# Makefile
+TARGET	= user_test
+ARCH	= sm_20
+CC	= gcc
+CLANG	= clang -cc1 -nostdsysteminc -triple nvptx-unknown-unknown -fcuda-is-device -emit-llvm
+LLC     = llc -march=nvptx64 -mcpu=$(ARCH)
+PTXAS   = ptxas -arch $(ARCH)
+LIBS	= -lucuda -lgdev
+CFLAGS	= -L /usr/local/gdev/lib64 -I /usr/local/gdev/include
+
+all:
+	$(CLANG) -I . -o fmadd_gpu.ll clang/fmadd_gpu.cu
+	$(LLC) -o fmadd_gpu.ptx fmadd_gpu.ll
+	$(PTXAS) -o fmadd_gpu.cubin fmadd_gpu.ptx
+	$(CC) -o $(TARGET) $(CFLAGS) main.c fmadd.c $(LIBS)
+
+clean:
+	rm -f $(TARGET) *.cubin *.ptx *.ll ./*~
diff --git a/test/cuda/user/fmadd/clang/cuda.h b/test/cuda/user/fmadd/clang/cuda.h
@@ -0,0 +1 @@
+../../../common/clang/cuda.h
diff --git a/test/cuda/user/fmadd/clang/fmadd_gpu.cu b/test/cuda/user/fmadd/clang/fmadd_gpu.cu
@@ -0,0 +1 @@
+../../../common/clang/fmadd_gpu.cu
diff --git a/test/cuda/user/fmmul/Makefile.llvm b/test/cuda/user/fmmul/Makefile.llvm
@@ -0,0 +1,18 @@
+# Makefile
+TARGET	= user_test
+ARCH	= sm_20
+CC	= gcc
+CLANG	= clang -cc1 -nostdsysteminc -triple nvptx-unknown-unknown -fcuda-is-device -emit-llvm
+LLC     = llc -march=nvptx64 -mcpu=$(ARCH)
+PTXAS   = ptxas -arch $(ARCH)
+LIBS	= -lucuda -lgdev
+CFLAGS	= -L /usr/local/gdev/lib64 -I /usr/local/gdev/include
+
+all:
+	$(CLANG) -I . -o fmmul_gpu.ll clang/fmmul_gpu.cu
+	$(LLC) -o fmmul_gpu.ptx fmmul_gpu.ll
+	$(PTXAS) -o fmmul_gpu.cubin fmmul_gpu.ptx
+	$(CC) -o $(TARGET) $(CFLAGS) main.c fmmul.c $(LIBS)
+
+clean:
+	rm -f $(TARGET) *.cubin *.ptx *.ll ./*~
diff --git a/test/cuda/user/fmmul/clang/cuda.h b/test/cuda/user/fmmul/clang/cuda.h
@@ -0,0 +1 @@
+../../../common/clang/cuda.h
diff --git a/test/cuda/user/fmmul/clang/fmmul_gpu.cu b/test/cuda/user/fmmul/clang/fmmul_gpu.cu
@@ -0,0 +1 @@
+../../../common/clang/fmmul_gpu.cu
diff --git a/test/cuda/user/idle/Makefile.llvm b/test/cuda/user/idle/Makefile.llvm
@@ -0,0 +1,18 @@
+# Makefile
+TARGET	= user_test
+ARCH	= sm_20
+CC	= gcc
+CLANG	= clang -cc1 -nostdsysteminc -triple nvptx-unknown-unknown -fcuda-is-device -emit-llvm
+LLC     = llc -march=nvptx64 -mcpu=$(ARCH)
+PTXAS   = ptxas -arch $(ARCH)
+LIBS	= -lucuda -lgdev
+CFLAGS	= -L /usr/local/gdev/lib64 -I /usr/local/gdev/include
+
+all:
+	$(CLANG) -I . -o idle_gpu.ll clang/idle_gpu.cu
+	$(LLC) -o idle_gpu.ptx idle_gpu.ll
+	$(PTXAS) -o idle_gpu.cubin idle_gpu.ptx
+	$(CC) -o $(TARGET) $(CFLAGS) main.c idle.c $(LIBS)
+
+clean:
+	rm -f $(TARGET) *.cubin *.ptx *.ll ./*~
diff --git a/test/cuda/user/idle/clang/cuda.h b/test/cuda/user/idle/clang/cuda.h
@@ -0,0 +1 @@
+../../../common/clang/cuda.h
diff --git a/test/cuda/user/idle/clang/idle_gpu.cu b/test/cuda/user/idle/clang/idle_gpu.cu
@@ -0,0 +1 @@
+../../../common/clang/idle_gpu.cu
diff --git a/test/cuda/user/loop/Makefile.llvm b/test/cuda/user/loop/Makefile.llvm
@@ -0,0 +1,18 @@
+# Makefile
+TARGET	= user_test
+ARCH	= sm_20
+CC	= gcc
+CLANG	= clang -cc1 -nostdsysteminc -triple nvptx-unknown-unknown -fcuda-is-device -emit-llvm
+LLC     = llc -march=nvptx64 -mcpu=$(ARCH)
+PTXAS   = ptxas -arch $(ARCH)
+LIBS	= -lucuda -lgdev
+CFLAGS	= -L /usr/local/gdev/lib64 -I /usr/local/gdev/include
+
+all:
+	$(CLANG) -I . -o loop_gpu.ll clang/loop_gpu.cu
+	$(LLC) -o loop_gpu.ptx loop_gpu.ll
+	$(PTXAS) -o loop_gpu.cubin loop_gpu.ptx
+	$(CC) -o $(TARGET) $(CFLAGS) main.c loop.c $(LIBS)
+
+clean:
+	rm -f $(TARGET) *.cubin *.ptx *.ll ./*~