Skip to content

Commit

Permalink
compiler: added PTX compilers using LLVM and Open64(nvopencc)
Browse files Browse the repository at this point in the history
  • Loading branch information
m-iwata committed Nov 7, 2014
1 parent e639547 commit 4b91fcf
Show file tree
Hide file tree
Showing 65 changed files with 599 additions and 0 deletions.
10 changes: 10 additions & 0 deletions CMakeLists.txt
Expand Up @@ -14,6 +14,8 @@ OPTION(user "user mode(default off)" OFF)
OPTION(runtime "enable CUDA runtime API(default on)" ON)
OPTION(usched "enable user mode scheduler(default off)" OFF)
OPTION(use_as "use assembler(default off)" OFF)
OPTION(use_llvm "use LLVM compiler(default off)" OFF)
OPTION(use_open64 "use Open64(nvopencc) compiler(default off)" OFF)

MACRO(INCLUDE_DIRECTORY_IF_EXISTS DIR)
IF(EXISTS "${DIR}/")
Expand Down Expand Up @@ -53,3 +55,11 @@ ENDIF(NOT user)
IF(use_as)
ADD_SUBDIRECTORY(compiler/as)
ENDIF(use_as)

IF(use_llvm)
ADD_SUBDIRECTORY(compiler/llvm)
ENDIF(use_llvm)

IF(use_open64)
ADD_SUBDIRECTORY(compiler/open64)
ENDIF(use_open64)
42 changes: 42 additions & 0 deletions compiler/llvm/CMakeLists.txt
@@ -0,0 +1,42 @@
EXECUTE_PROCESS(
COMMAND ${CMAKE_C_COMPILER} -dumpversion
OUTPUT_VARIABLE GCC_VERSION
)
if (GCC_VERSION VERSION_GREATER 4.7 OR GCC_VERSION VERSION_EQUAL 4.7)
EXECUTE_PROCESS(
COMMAND svn co http://llvm.org/svn/llvm-project/llvm/trunk llvm
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
)
EXECUTE_PROCESS(
COMMAND svn co http://llvm.org/svn/llvm-project/cfe/trunk clang
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/llvm/tools
)
EXECUTE_PROCESS(
COMMAND svn co http://llvm.org/svn/llvm-project/compiler-rt/trunk compiler-rt
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/llvm/projects
)
EXECUTE_PROCESS(
COMMAND mkdir build
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
)
EXECUTE_PROCESS(
COMMAND cmake ${CMAKE_CURRENT_BINARY_DIR}/llvm
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/build
)
ADD_CUSTOM_TARGET(llvm ALL make
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/build
)
ADD_CUSTOM_TARGET(clean make clean
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/build
)
INSTALL(
CODE "
EXECUTE_PROCESS(
COMMAND make install
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/build
)
"
)
else()
MESSAGE("skip: LLVM is required GCC version 4.7 or later.")
endif()
44 changes: 44 additions & 0 deletions compiler/open64/CMakeLists.txt
@@ -0,0 +1,44 @@
FIND_PACKAGE(FLEX)
FIND_PACKAGE(BISON)
FIND_FILE(PERL4_CORELIBS getopts.pl PATHS /usr/lib/perl5 /usr/share/perl5)
FIND_PROGRAM(TCSH tcsh PATHS /bin)
EXECUTE_PROCESS(
COMMAND ${CMAKE_C_COMPILER} -dumpversion
OUTPUT_VARIABLE GCC_VERSION
)
if (GCC_VERSION VERSION_LESS 4.7)
if(EXISTS ${PERL4_CORELIBS})
if(EXISTS ${TCSH})
EXECUTE_PROCESS(
COMMAND tar xzf ${CMAKE_CURRENT_SOURCE_DIR}/nvopencc_5.0_src_13604779.tar.gz
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
)
EXECUTE_PROCESS(
COMMAND cat ${CMAKE_CURRENT_SOURCE_DIR}/nvopencc_5.0.patch
COMMAND patch -p0 --read-only=ignore
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
)
ADD_CUSTOM_TARGET(open64 ALL make
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/src/targia3264_nvisa
)
ADD_CUSTOM_TARGET(clean make clean
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/src/targia3264_nvisa
)
INSTALL(
PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/src/targia3264_nvisa/bin/nvopencc
DESTINATION gdev/open64/bin
)
FILE(GLOB libs "${CMAKE_CURRENT_BINARY_DIR}/src/targia3264_nvisa/lib/*")
INSTALL(
PROGRAMS ${libs}
DESTINATION gdev/open64/lib
)
else()
MESSAGE("skip: Open64 is required tcsh.")
endif()
else()
MESSAGE("skip: Open64 is required perl4-corelibs package.")
endif()
else()
MESSAGE("skip: Open64 is required GCC version 4.6 or older.")
endif()
33 changes: 33 additions & 0 deletions compiler/open64/nvopencc_5.0.patch
@@ -0,0 +1,33 @@
--- ./src/Makefile.gsetup.orig 2012-04-24 02:04:57.000000000 +0900
+++ ./src/Makefile.gsetup 2014-10-27 04:16:31.717636564 +0900
@@ -531,7 +531,7 @@ ERROR_ON_WARNINGS = #-Werror
ifeq ($(BUILD_TARGET), NVISA)
# This is due to unsupported aliases in Mach-O
ifneq ($(BUILD_OS), DARWIN)
-ERROR_ON_WARNINGS = -Werror
+ERROR_ON_WARNINGS = #-Werror
endif
endif

--- ./src/gccfe/gnu/c-parse.y.orig 2012-03-16 04:52:30.000000000 +0900
+++ ./src/gccfe/gnu/c-parse.y 2014-10-27 04:50:16.125664019 +0900
@@ -66,6 +66,7 @@ Software Foundation, 59 Temple Place - S
#include <locale.h>
#endif

+#include "y.tab.c"

/* Like YYERROR but do call yyerror. */
#define YYERROR1 { yyerror ("syntax error"); YYERROR; }
--- ./src/linux/make/gcommondefs.orig 2012-03-16 04:53:45.000000000 +0900
+++ ./src/linux/make/gcommondefs 2014-10-27 04:06:48.885628660 +0900
@@ -135,7 +135,8 @@ ifeq ($(BUILD_COMPILER), GNU)
F90 +=
endif
ifeq ($(BUILD_TARGET), NVISA)
- ROOT_DIR := $(dir $(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST)))/../../../..
+# ROOT_DIR := $(dir $(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST)))/../../../..
+ ROOT_DIR := $(dir $(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST)))/../../..
ifneq ($(BUILD_OS), LINUX)
ifneq ($(BUILD_OS), DARWIN)
ifndef USE_NATIVE
Binary file not shown.
20 changes: 20 additions & 0 deletions test/cuda/common/clang/cuda.h
@@ -0,0 +1,20 @@
/* Minimal declarations for CUDA support. Testing purposes only. */

#include <stddef.h>

#define __constant__ __attribute__((constant))
#define __device__ __attribute__((device))
#define __global__ __attribute__((global))
#define __host__ __attribute__((host))
#define __shared__ __attribute__((shared))
#define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__)))

struct dim3 {
unsigned x, y, z;
__host__ __device__ dim3(unsigned x, unsigned y = 1, unsigned z = 1) : x(x), y(y), z(z) {}
};

typedef struct cudaStream *cudaStream_t;

int cudaConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0,
cudaStream_t stream = 0);
8 changes: 8 additions & 0 deletions test/cuda/common/clang/float_gpu.cu
@@ -0,0 +1,8 @@
#include <stdint.h>
#include "clang/cuda.h"
extern "C"
__global__
void add(float a, float b, float *c)
{
*c = a + b;
}
14 changes: 14 additions & 0 deletions test/cuda/common/clang/fmadd_gpu.cu
@@ -0,0 +1,14 @@
#include <stdint.h>
#include "clang/cuda.h"
__global__
void add(float *a, float *b, float *c, int n)
{
int i = __builtin_ptx_read_ctaid_x() * __builtin_ptx_read_ntid_x()
+ __builtin_ptx_read_tid_x();
int j = __builtin_ptx_read_ctaid_y() * __builtin_ptx_read_ntid_y()
+ __builtin_ptx_read_tid_y();
if (i < n && j < n) {
int idx = i * n + j;
c[idx] = a[idx] + b[idx];
}
}
14 changes: 14 additions & 0 deletions test/cuda/common/clang/fmmul_gpu.cu
@@ -0,0 +1,14 @@
#include <stdint.h>
#include "clang/cuda.h"
__global__
void mul(float *a, float *b, float *c, int n)
{
int i = __builtin_ptx_read_ctaid_x() * __builtin_ptx_read_ntid_x()
+ __builtin_ptx_read_tid_x();
int j = __builtin_ptx_read_ctaid_y() * __builtin_ptx_read_ntid_y()
+ __builtin_ptx_read_tid_y();
if (i < n && j < n) {
int idx = i * n + j;
c[idx] = a[idx] * b[idx];
}
}
26 changes: 26 additions & 0 deletions test/cuda/common/clang/idle_gpu.cu
@@ -0,0 +1,26 @@
#include <stdint.h>
#include "clang/cuda.h"

extern "C"
__global__
void idle(unsigned int *p, unsigned int n)
{
int x = __builtin_ptx_read_ctaid_x() * __builtin_ptx_read_ntid_x()
+ __builtin_ptx_read_tid_x();
int y = __builtin_ptx_read_ctaid_y() * __builtin_ptx_read_ntid_y()
+ __builtin_ptx_read_tid_y();
unsigned int i = 0, j = 0, k = 0;
__shared__ int s;

s = *p;
if (x == 0 && y == 0) {
for (i = 0; i < n; i++) {
if (x + y > n) {
s = s + x;
if (s > x + y)
s = x;
}
}
}
*p = s;
}
13 changes: 13 additions & 0 deletions test/cuda/common/clang/loop_gpu.cu
@@ -0,0 +1,13 @@
#include <stdint.h>
#include "clang/cuda.h"

__global__
void loop(uint32_t *data, uint32_t size, uint32_t n)
{
int i;
// for (i = 0; i < n/40; i++) {
for (i = 0; i < n/5; i++) {
if (i * 4 < size)
data[i] = i + n;
}
}
14 changes: 14 additions & 0 deletions test/cuda/common/clang/madd_gpu.cu
@@ -0,0 +1,14 @@
#include <stdint.h>
#include "clang/cuda.h"
__global__
void add(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t n)
{
int i = __builtin_ptx_read_ctaid_x() * __builtin_ptx_read_ntid_x()
+ __builtin_ptx_read_tid_x();
int j = __builtin_ptx_read_ctaid_y() * __builtin_ptx_read_ntid_y()
+ __builtin_ptx_read_tid_y();
if (i < n && j < n) {
int idx = i * n + j;
c[idx] = a[idx] + b[idx];
}
}
21 changes: 21 additions & 0 deletions test/cuda/common/clang/mmul_gpu.cu
@@ -0,0 +1,21 @@
#include "clang/cuda.h"

extern "C" __global__ void multiply(unsigned int *a, unsigned int *b, unsigned int *c,
int n)
{
unsigned int i;
unsigned int product = 0;

int row = __builtin_ptx_read_ctaid_y() * __builtin_ptx_read_ntid_y()
+ __builtin_ptx_read_tid_y();
int col = __builtin_ptx_read_ctaid_x() * __builtin_ptx_read_ntid_x()
+ __builtin_ptx_read_tid_x();

if(row < n && col < n){
for (i = 0; i < n; i++)
product += a[row * n + i] * b[i * n + col];

c[row*n + col] = product;
}
}

18 changes: 18 additions & 0 deletions test/cuda/user/float/Makefile.llvm
@@ -0,0 +1,18 @@
# Makefile
TARGET = user_test
ARCH = sm_20
CC = gcc
CLANG = clang -cc1 -nostdsysteminc -triple nvptx-unknown-unknown -fcuda-is-device -emit-llvm
LLC = llc -march=nvptx64 -mcpu=$(ARCH)
PTXAS = ptxas -arch $(ARCH)
LIBS = -lucuda -lgdev
CFLAGS = -L /usr/local/gdev/lib64 -I /usr/local/gdev/include

all:
$(CLANG) -I . -o float_gpu.ll clang/float_gpu.cu
$(LLC) -o float_gpu.ptx float_gpu.ll
$(PTXAS) -o float_gpu.cubin float_gpu.ptx
$(CC) -o $(TARGET) $(CFLAGS) main.c float.c $(LIBS)

clean:
rm -f $(TARGET) *.cubin *.ptx *.ll ./*~
1 change: 1 addition & 0 deletions test/cuda/user/float/clang/cuda.h
1 change: 1 addition & 0 deletions test/cuda/user/float/clang/float_gpu.cu
18 changes: 18 additions & 0 deletions test/cuda/user/fmadd/Makefile.llvm
@@ -0,0 +1,18 @@
# Makefile
TARGET = user_test
ARCH = sm_20
CC = gcc
CLANG = clang -cc1 -nostdsysteminc -triple nvptx-unknown-unknown -fcuda-is-device -emit-llvm
LLC = llc -march=nvptx64 -mcpu=$(ARCH)
PTXAS = ptxas -arch $(ARCH)
LIBS = -lucuda -lgdev
CFLAGS = -L /usr/local/gdev/lib64 -I /usr/local/gdev/include

all:
$(CLANG) -I . -o fmadd_gpu.ll clang/fmadd_gpu.cu
$(LLC) -o fmadd_gpu.ptx fmadd_gpu.ll
$(PTXAS) -o fmadd_gpu.cubin fmadd_gpu.ptx
$(CC) -o $(TARGET) $(CFLAGS) main.c fmadd.c $(LIBS)

clean:
rm -f $(TARGET) *.cubin *.ptx *.ll ./*~
1 change: 1 addition & 0 deletions test/cuda/user/fmadd/clang/cuda.h
1 change: 1 addition & 0 deletions test/cuda/user/fmadd/clang/fmadd_gpu.cu
18 changes: 18 additions & 0 deletions test/cuda/user/fmmul/Makefile.llvm
@@ -0,0 +1,18 @@
# Makefile
TARGET = user_test
ARCH = sm_20
CC = gcc
CLANG = clang -cc1 -nostdsysteminc -triple nvptx-unknown-unknown -fcuda-is-device -emit-llvm
LLC = llc -march=nvptx64 -mcpu=$(ARCH)
PTXAS = ptxas -arch $(ARCH)
LIBS = -lucuda -lgdev
CFLAGS = -L /usr/local/gdev/lib64 -I /usr/local/gdev/include

all:
$(CLANG) -I . -o fmmul_gpu.ll clang/fmmul_gpu.cu
$(LLC) -o fmmul_gpu.ptx fmmul_gpu.ll
$(PTXAS) -o fmmul_gpu.cubin fmmul_gpu.ptx
$(CC) -o $(TARGET) $(CFLAGS) main.c fmmul.c $(LIBS)

clean:
rm -f $(TARGET) *.cubin *.ptx *.ll ./*~
1 change: 1 addition & 0 deletions test/cuda/user/fmmul/clang/cuda.h
1 change: 1 addition & 0 deletions test/cuda/user/fmmul/clang/fmmul_gpu.cu
18 changes: 18 additions & 0 deletions test/cuda/user/idle/Makefile.llvm
@@ -0,0 +1,18 @@
# Makefile
TARGET = user_test
ARCH = sm_20
CC = gcc
CLANG = clang -cc1 -nostdsysteminc -triple nvptx-unknown-unknown -fcuda-is-device -emit-llvm
LLC = llc -march=nvptx64 -mcpu=$(ARCH)
PTXAS = ptxas -arch $(ARCH)
LIBS = -lucuda -lgdev
CFLAGS = -L /usr/local/gdev/lib64 -I /usr/local/gdev/include

all:
$(CLANG) -I . -o idle_gpu.ll clang/idle_gpu.cu
$(LLC) -o idle_gpu.ptx idle_gpu.ll
$(PTXAS) -o idle_gpu.cubin idle_gpu.ptx
$(CC) -o $(TARGET) $(CFLAGS) main.c idle.c $(LIBS)

clean:
rm -f $(TARGET) *.cubin *.ptx *.ll ./*~
1 change: 1 addition & 0 deletions test/cuda/user/idle/clang/cuda.h
1 change: 1 addition & 0 deletions test/cuda/user/idle/clang/idle_gpu.cu
18 changes: 18 additions & 0 deletions test/cuda/user/loop/Makefile.llvm
@@ -0,0 +1,18 @@
# Makefile
TARGET = user_test
ARCH = sm_20
CC = gcc
CLANG = clang -cc1 -nostdsysteminc -triple nvptx-unknown-unknown -fcuda-is-device -emit-llvm
LLC = llc -march=nvptx64 -mcpu=$(ARCH)
PTXAS = ptxas -arch $(ARCH)
LIBS = -lucuda -lgdev
CFLAGS = -L /usr/local/gdev/lib64 -I /usr/local/gdev/include

all:
$(CLANG) -I . -o loop_gpu.ll clang/loop_gpu.cu
$(LLC) -o loop_gpu.ptx loop_gpu.ll
$(PTXAS) -o loop_gpu.cubin loop_gpu.ptx
$(CC) -o $(TARGET) $(CFLAGS) main.c loop.c $(LIBS)

clean:
rm -f $(TARGET) *.cubin *.ptx *.ll ./*~

0 comments on commit 4b91fcf

Please sign in to comment.