Permalink
Browse files

cuda: implemented module and context management

cuda: fixed minor issues in init, device, and version management
gdev: added header files to be installed to /usr/local/gdev/include
  • Loading branch information...
1 parent d2a083c commit 180bc32b3a3d5d9a80f9eb143b209616f71d8473 Shinpei Kato committed Nov 15, 2011
View
1 common/gdev_nvidia_def.h
@@ -51,6 +51,7 @@
*/
struct gdev_kernel {
uint64_t code_addr; /* code address in VAS */
+ uint32_t code_size; /* code size */
uint32_t code_pc; /* initial program counter */
struct gdev_cmem {
uint64_t addr; /* constant memory address in VAS */
View
6 cuda/driver_api/Makefile
@@ -4,7 +4,7 @@ CC = gcc
TARGET = libcuda
GDEVDIR = /usr/local/gdev
CFLAGS = -O3 -Wall -I$(GDEVDIR)/include
-HEADERS = {cuda.h}
+HEADERS = cuda.h
OBJS = $(patsubst %.c,%.o,$(wildcard ./*.c))
ZOMBIE = $(wildcard ./*~)
@@ -18,11 +18,11 @@ all: $(OBJS)
install:
cp -f ./$(TARGET).so.1 $(GDEVDIR)/lib64/$(TARGET).so.1
ln -sf $(GDEVDIR)/lib64/$(TARGET).so.1 $(GDEVDIR)/lib64/$(TARGET).so
- cp -f ./$(HEADERS) $(GDEVDIR)/include
+ cp -f ./{$(HEADERS)} $(GDEVDIR)/include
uninstall:
rm -f $(GDEVDIR)/lib64/$(TARGET).*
- rm -f $(GDEVDIR)/include/$(HEADERS)
+ rm -f $(GDEVDIR)/include/{$(HEADERS)}
clean:
rm -f $(TARGET).so.* $(OBJS) $(ZOMBIE)
View
123 cuda/driver_api/context.c
@@ -25,8 +25,13 @@
*/
#include "cuda.h"
+#include "gdev_cuda.h"
#include "gdev_api.h"
#include "gdev_cuda.h"
+#include "gdev_list.h"
+
+struct CUctx_st *gdev_ctx_current = NULL;
+gdev_list_t gdev_ctx_list;
/**
* Creates a new CUDA context and associates it with the calling thread.
@@ -89,9 +94,11 @@
*/
CUresult cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev)
{
- int minor = dev;
+ CUresult res;
struct CUctx_st *ctx;
+ struct gdev_cuda_info *cuda_info;
gdev_handle_t *handle;
+ int minor = dev;
if (!gdev_initialized)
return CUDA_ERROR_NOT_INITIALIZED;
@@ -100,17 +107,47 @@ CUresult cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev)
if (!pctx)
return CUDA_ERROR_INVALID_VALUE;
- if (!(ctx = (CUcontext)malloc(sizeof(*ctx))))
- return CUDA_ERROR_OUT_OF_MEMORY;
+ if (!(ctx = (CUcontext)malloc(sizeof(*ctx)))) {
+ res = CUDA_ERROR_OUT_OF_MEMORY;
+ goto fail_malloc_ctx;
+ }
if (!(handle = gopen(minor))) {
- return CUDA_ERROR_UNKNOWN;
+ res = CUDA_ERROR_UNKNOWN;
+ goto fail_open_gdev;
}
+ /* save the Gdev handle. */
ctx->gdev_handle = handle;
+
+ /* get the CUDA-specific device information. */
+ cuda_info = &ctx->cuda_info;
+ if (gquery(handle, GDEV_NVIDIA_QUERY_MP_COUNT, &cuda_info->mp_count)) {
+ res = CUDA_ERROR_UNKNOWN;
+ goto fail_query_mp_count;
+ }
+ /* FIXME: per-thread warp size and the number of active warps may not be
+ static numbers. */
+ cuda_info->warp_count = 48;
+ cuda_info->warp_size = 32;
+
+ /* save the current context to the stack, if necessary. */
+ __gdev_list_init(&ctx->list_entry, ctx);
+ if (gdev_ctx_current) {
+ __gdev_list_add(&gdev_ctx_current->list_entry, &gdev_ctx_list);
+ }
+
+ gdev_ctx_current = ctx; /* set to the current context. */
*pctx = ctx;
return CUDA_SUCCESS;
+
+fail_query_mp_count:
+ gclose(handle);
+fail_open_gdev:
+ free(ctx);
+fail_malloc_ctx:
+ return res;
}
/**
@@ -128,13 +165,20 @@ CUresult cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev)
*/
CUresult cuCtxDestroy(CUcontext ctx)
{
+ gdev_list_t *list_head;
+
if (!gdev_initialized)
return CUDA_ERROR_NOT_INITIALIZED;
if (!ctx)
return CUDA_ERROR_INVALID_VALUE;
if (gclose(ctx->gdev_handle))
return CUDA_ERROR_INVALID_CONTEXT;
+ list_head = __gdev_list_head(&gdev_ctx_list);
+ gdev_ctx_current = __gdev_list_container(list_head);
+ if (gdev_ctx_current)
+ __gdev_list_del(&gdev_ctx_current->list_entry);
+
free(ctx);
return CUDA_SUCCESS;
@@ -158,15 +202,78 @@ CUresult cuCtxDetach(CUcontext ctx)
return CUDA_SUCCESS;
}
-CUresult cuCtxPopCurrent(CUcontext *pctx)
+/**
+ * Pushes the given context @ctx onto the CPU thread's stack of current
+ * contexts. The specified context becomes the CPU thread's current context,
+ * so all CUDA functions that operate on the current context are affected.
+ *
+ * The previous current context may be made current again by calling
+ * cuCtxDestroy() or cuCtxPopCurrent().
+ *
+ * The context must be "floating," i.e. not attached to any thread. Contexts
+ * are made to float by calling cuCtxPopCurrent().
+ *
+ * Parameters:
+ * ctx - Floating context to attach
+ *
+ * Returns:
+ * CUDA_SUCCESS, CUDA_ERROR_DEINITIALIZED, CUDA_ERROR_NOT_INITIALIZED,
+ * CUDA_ERROR_INVALID_CONTEXT, CUDA_ERROR_INVALID_VALUE
+ */
+CUresult cuCtxPushCurrent(CUcontext ctx)
{
- printf("cuCtxPopCurrent: Not Implemented Yet\n");
+ if (!gdev_initialized)
+ return CUDA_ERROR_NOT_INITIALIZED;
+ if (!ctx)
+ return CUDA_ERROR_INVALID_VALUE;
+ if (!gdev_ctx_current)
+ return CUDA_ERROR_INVALID_CONTEXT;
+
+ /* save the current context to the stack. */
+ __gdev_list_add(&gdev_ctx_current->list_entry, &gdev_ctx_list);
+ /* set @ctx to the current context. */
+ gdev_ctx_current = ctx;
+
return CUDA_SUCCESS;
}
-CUresult cuCtxPushCurrent(CUcontext ctx)
+/**
+ * Pops the current CUDA context from the CPU thread. The CUDA context must
+ * have a usage count of 1. CUDA contexts have a usage count of 1 upon
+ * creation; the usage count may be incremented with cuCtxAttach() and
+ * decremented with cuCtxDetach().
+ *
+ * If successful, cuCtxPopCurrent() passes back the new context handle in
+ * @pctx. The old context may then be made current to a different CPU thread
+ * by calling cuCtxPushCurrent().
+ *
+ * Floating contexts may be destroyed by calling cuCtxDestroy().
+ *
+ * If a context was current to the CPU thread before cuCtxCreate() or
+ * cuCtxPushCurrent() was called, this function makes that context current to
+ * the CPU thread again.
+ *
+ * Parameters:
+ * pctx - Returned new context handle
+ *
+ * Returns:
+ * CUDA_SUCCESS, CUDA_ERROR_DEINITIALIZED, CUDA_ERROR_NOT_INITIALIZED,
+ * CUDA_ERROR_INVALID_CONTEXT
+ */
+CUresult cuCtxPopCurrent(CUcontext *pctx)
{
- printf("cuCtxPushCurrent: Not Implemented Yet\n");
+ gdev_list_t *list_head;
+ if (!gdev_initialized)
+ return CUDA_ERROR_NOT_INITIALIZED;
+ if (!pctx)
+ return CUDA_ERROR_INVALID_CONTEXT;
+
+ *pctx = gdev_ctx_current;
+ list_head = __gdev_list_head(&gdev_ctx_list);
+ gdev_ctx_current = __gdev_list_container(list_head);
+ if (gdev_ctx_current)
+ __gdev_list_del(&gdev_ctx_current->list_entry);
+
return CUDA_SUCCESS;
}
View
3 cuda/driver_api/cuda.h
@@ -514,7 +514,4 @@ CUresult cuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int nu
CUresult cuModuleLoadFatBinary(CUmodule *module, const void *fatCubin);
CUresult cuModuleUnload(CUmodule hmod);
-extern int gdev_initialized;
-extern int gdev_device_count;
-
#endif
View
3 cuda/driver_api/device.c
@@ -25,9 +25,10 @@
*/
#include "cuda.h"
+#include "gdev_cuda.h"
#include <stdio.h>
-CUdevice gdev_fd = -1;
+int gdev_device_count = 0;
/**
* Returns in *major and *minor the major and minor revision numbers that
View
70 cuda/driver_api/gdev_cuda.h
@@ -28,47 +28,75 @@
#define __GDEV_CUDA_H__
#define GDEV_CUDA_VERSION 4000
+#define GDEV_CUDA_USER_PARAM_BASE 0x20
+#define GDEV_CUDA_CMEM_SEGMENT_COUNT 16 /* by definition? */
#ifndef NULL
#define NULL 0
#endif
-#ifndef TRUE
-#define TRUE 1
-#endif
-#ifndef FALSE
-#define FALSE 0
-#endif
#include "gdev_api.h"
+#include "gdev_list.h"
+
+struct gdev_cuda_info { /* CUDA-specific device information stored in each context (CUctx_st.cuda_info) */
+ uint32_t mp_count; /* filled by gquery(GDEV_NVIDIA_QUERY_MP_COUNT) in cuCtxCreate() */
+ uint32_t warp_count; /* hard-coded to 48 in cuCtxCreate() (flagged FIXME there) */
+ uint32_t warp_size; /* hard-coded to 32 in cuCtxCreate() (flagged FIXME there) */
+};
+
+struct gdev_cuda_raw_func { /* per-function data embedded in CUfunc_st.raw_func; presumably parsed from the cubin -- TODO confirm in the loader */
+ char *name; /* presumably the function's symbol name -- TODO confirm */
+ void *code_buf; /* presumably a host pointer to the function's code -- TODO confirm */
+ uint32_t code_size; /* size that goes with code_buf; presumably in bytes */
+ struct gdev_cuda_cmem { /* one constant-memory segment: buffer plus size */
+ void *buf;
+ uint32_t size;
+ } cmem[GDEV_NVIDIA_CONST_SEGMENT_MAX_COUNT]; /* NOTE(review): this bound is not defined in the visible part of this header, and GDEV_CUDA_CMEM_SEGMENT_COUNT defined above is not used here */
+ uint32_t reg_count; /* presumably registers used by the function -- TODO confirm */
+ uint32_t bar_count; /* presumably barriers used (the removed struct gdev_cubin called this "barriers") */
+ uint32_t stack_depth;
+ uint32_t shared_size; /* presumably shared memory size -- units not visible here */
+ uint32_t param_size; /* presumably parameter space size -- units not visible here */
+ uint32_t local_size; /* presumably local memory size -- units not visible here */
+ uint32_t local_size_neg; /* NOTE(review): meaning not evident from the code shown; confirm against the cubin loader */
+};
struct CUctx_st { /* context object that a CUcontext points to; allocated in cuCtxCreate() */
gdev_handle_t *gdev_handle; /* Gdev handle from gopen() in cuCtxCreate(); closed with gclose() in cuCtxDestroy() */
+ gdev_list_t list_entry; /* links this context into gdev_ctx_list while another context is current */
+ struct gdev_cuda_info cuda_info; /* device information filled in by cuCtxCreate() */
};
struct CUmod_st { /* module object that a CUmodule points to; allocated in cuModuleLoad() */
+ FILE *fp; /* not touched by the code shown here; presumably the opened cubin file -- TODO confirm */
+ void *bin; /* not touched by the code shown here; presumably the raw cubin contents -- TODO confirm */
+ void *image_buf; /* host buffer of image_size bytes, malloc()ed and zeroed in cuModuleLoad() before being copied to the device */
+ uint64_t image_addr; /* device address from gmalloc(handle, image_size); destination of gmemcpy_to_device() */
+ uint64_t local_addr; /* device address from gmalloc(handle, local_size) */
+ uint32_t image_size; /* size used for both image_buf and image_addr; not set by cuModuleLoad() itself */
+ uint32_t local_size; /* size used for local_addr; not set by cuModuleLoad() itself */
+ uint32_t func_count; /* presumably the number of entries in func_list -- TODO confirm */
+ gdev_list_t func_list; /* presumably the list of CUfunc_st linked through CUfunc_st.list_entry -- TODO confirm */
};
struct CUfunc_st { /* function (kernel) object; presumably what a CUfunction points to -- TODO confirm in cuda.h */
+ struct gdev_kernel kernel; /* Gdev kernel descriptor (struct gdev_kernel from gdev_nvidia_def.h) */
+ struct gdev_cuda_raw_func raw_func; /* per-function data; see struct gdev_cuda_raw_func */
+ gdev_list_t list_entry; /* presumably links this function into CUmod_st.func_list -- TODO confirm */
};
struct CUtexref_st { /* placeholder with no members yet; presumably backs CUtexref -- TODO confirm in cuda.h */
};
-struct gdev_const {
- void *buf;
- uint32_t size;
-};
+CUresult gdev_cuda_load_cubin(struct CUmod_st *mod, const char *fname);
+CUresult gdev_cuda_unload_cubin(struct CUmod_st *mod);
+void gdev_cuda_setup_kernels(struct CUmod_st *mod, struct gdev_cuda_info *info);
+CUresult gdev_cuda_assign_image(struct CUmod_st *mod);
+CUresult gdev_cuda_assign_local(struct CUmod_st *mod);
-struct gdev_cubin {
- void *code_buf;
- uint32_t code_size;
- struct gdev_const c[16]; /* 16 by definition? */
- uint32_t reg_count;
- uint32_t barriers;
- uint32_t stack_depth;
- uint32_t shared_size;
- uint32_t param_size;
- uint32_t local_size;
-};
+/* Global driver state shared by the driver API sources. These are only
+ * declarations: each object is defined in exactly one .c file, so a header
+ * included from several sources must not define it again. */
+extern int gdev_initialized; /* defined in init.c */
+extern int gdev_device_count; /* defined in device.c */
+extern struct CUctx_st *gdev_ctx_current; /* defined in context.c */
+extern gdev_list_t gdev_ctx_list; /* defined in context.c */
#endif
View
3 cuda/driver_api/init.c
@@ -25,12 +25,12 @@
*/
#include "cuda.h"
+#include "gdev_cuda.h"
#include <fcntl.h>
#include <stdio.h>
#include <sys/unistd.h>
int gdev_initialized = 0;
-int gdev_device_count = 0;
/**
* Initializes the driver API and must be called before any other function
@@ -70,6 +70,7 @@ CUresult cuInit(unsigned int Flags)
return CUDA_ERROR_INVALID_DEVICE;
gdev_device_count = minor;
+ __gdev_list_init(&gdev_ctx_list, NULL);
return CUDA_SUCCESS;
}
View
124 cuda/driver_api/module.c
@@ -28,9 +28,101 @@
#include "gdev_api.h"
#include "gdev_cuda.h"
+/**
+ * Takes a filename @fname and loads the corresponding module @module into the
+ * current context. The CUDA driver API does not attempt to lazily allocate
+ * the resources needed by a module; if the memory for functions and data
+ * (constant and global) needed by the module cannot be allocated,
+ * cuModuleLoad() fails. The file should be a cubin file as output by nvcc
+ * or a PTX file, either as output by nvcc or handwritten.
+ *
+ * If loading fails after the module object has been allocated, everything
+ * acquired so far is released again and *@module is set to NULL. The
+ * argument and state checks at the top return without touching *@module.
+ *
+ * Parameters:
+ * module - Returned module
+ * fname - Filename of module to load
+ *
+ * Returns:
+ * CUDA_SUCCESS, CUDA_ERROR_DEINITIALIZED, CUDA_ERROR_NOT_INITIALIZED,
+ * CUDA_ERROR_INVALID_CONTEXT, CUDA_ERROR_INVALID_VALUE, CUDA_ERROR_NOT_FOUND,
+ * CUDA_ERROR_OUT_OF_MEMORY, CUDA_ERROR_FILE_NOT_FOUND, CUDA_ERROR_UNKNOWN
+ */
CUresult cuModuleLoad(CUmodule *module, const char *fname)
{
+ CUresult res;
+ struct CUmod_st *mod;
+ struct CUctx_st *ctx;
+ gdev_handle_t *handle;
+
+ if (!gdev_initialized)
+ return CUDA_ERROR_NOT_INITIALIZED;
+ /* both the output pointer and the filename are required. */
+ if (!module || !fname)
+ return CUDA_ERROR_INVALID_VALUE;
+ if (!gdev_ctx_current)
+ return CUDA_ERROR_INVALID_CONTEXT;
+
+ ctx = gdev_ctx_current;
+ handle = ctx->gdev_handle;
+
+ if (!(mod = malloc(sizeof(*mod)))) {
+ res = CUDA_ERROR_OUT_OF_MEMORY;
+ goto fail_malloc_mod;
+ }
+
+ /* load the cubin image from the given object file. */
+ if ((res = gdev_cuda_load_cubin(mod, fname)) != CUDA_SUCCESS)
+ goto fail_load_cubin;
+
+ /* setup the kernels based on the cubin data. */
+ gdev_cuda_setup_kernels(mod, &ctx->cuda_info);
+
+ /* allocate local memory, and assign it to each function. */
+ if (!(mod->local_addr = gmalloc(handle, mod->local_size))) {
+ res = CUDA_ERROR_OUT_OF_MEMORY;
+ goto fail_gmalloc_local;
+ }
+
+ if ((res = gdev_cuda_assign_local(mod)))
+ goto fail_assign_local;
+
+ /* allocate code and constant memory and assign it to each function. */
+ if (!(mod->image_addr = gmalloc(handle, mod->image_size))) {
+ /* res still holds the success code from gdev_cuda_assign_local(),
+ so it must be set here before jumping to the cleanup path. */
+ res = CUDA_ERROR_OUT_OF_MEMORY;
+ goto fail_gmalloc_image;
+ }
+ /* this malloc() and memcpy() in gdev_cuda_assign_image() could be
+ removed if we use gmalloc_host() here, the following is just an easy
+ implementation, and doesn't affect performance much anyway. */
+ if (!(mod->image_buf = malloc(mod->image_size))) {
+ res = CUDA_ERROR_OUT_OF_MEMORY;
+ goto fail_malloc_image;
+ }
+ memset(mod->image_buf, 0, mod->image_size);
+ if ((res = gdev_cuda_assign_image(mod)))
+ goto fail_assign_image;
+
+ /* transfer the code and constant memory onto the device. */
+ if (gmemcpy_to_device(handle, mod->image_addr, mod->image_buf,
+ mod->image_size)) {
+ res = CUDA_ERROR_UNKNOWN;
+ goto fail_gmemcpy;
+ }
+
+ *module = mod;
+
return CUDA_SUCCESS;
+
+ /* cleanup path: labels are in reverse order of acquisition, so each
+ entry point releases exactly what had been acquired before the failure. */
+fail_gmemcpy:
+fail_assign_image:
+ free(mod->image_buf);
+fail_malloc_image:
+ gfree(handle, mod->image_addr);
+fail_gmalloc_image:
+fail_assign_local:
+ gfree(handle, mod->local_addr);
+fail_gmalloc_local:
+ gdev_cuda_unload_cubin(mod);
+fail_load_cubin:
+ free(mod);
+fail_malloc_mod:
+ *module = NULL;
+ return res;
}
CUresult cuModuleLoadFatBinary(CUmodule *module, const void *fatCubin)
@@ -39,8 +131,40 @@ CUresult cuModuleLoadFatBinary(CUmodule *module, const void *fatCubin)
return CUDA_SUCCESS;
}
+/**
+ * Unloads a module hmod from the current context.
+ *
+ * Releases, in this order: the host image buffer, the device image memory,
+ * the device local memory, the cubin data (gdev_cuda_unload_cubin()), and
+ * finally the module object itself. Device memory is freed through the Gdev
+ * handle of the context that is current at the time of the call.
+ *
+ * NOTE(review): if gdev_cuda_unload_cubin() fails, the function returns its
+ * error after the buffers above have already been released and without
+ * freeing the module object; confirm whether callers may retry.
+ *
+ * Parameters:
+ * hmod - Module to unload
+ *
+ * Returns:
+ * CUDA_SUCCESS, CUDA_ERROR_DEINITIALIZED, CUDA_ERROR_NOT_INITIALIZED,
+ * CUDA_ERROR_INVALID_CONTEXT, CUDA_ERROR_INVALID_VALUE
+ */
CUresult cuModuleUnload(CUmodule hmod)
{
+ CUresult res;
+ struct CUmod_st *mod = hmod;
+ gdev_handle_t *handle;
+
+ if (!gdev_initialized)
+ return CUDA_ERROR_NOT_INITIALIZED;
+ if (!mod)
+ return CUDA_ERROR_INVALID_VALUE;
+ if (!gdev_ctx_current)
+ return CUDA_ERROR_INVALID_CONTEXT;
+
+ handle = gdev_ctx_current->gdev_handle; /* the current context's handle, not one stored in the module */
+
+ free(mod->image_buf); /* host copy of the code/constant image */
+ gfree(handle, mod->image_addr); /* device copy of the image */
+ gfree(handle, mod->local_addr); /* device local memory */
+
+ if ((res = gdev_cuda_unload_cubin(mod)) != CUDA_SUCCESS)
+ return res; /* early exit: mod itself is not freed on this path */
+
+ free(mod);
+
return CUDA_SUCCESS;
}
View
2 cuda/driver_api/version.c
@@ -41,7 +41,7 @@
CUresult cuDriverGetVersion(int *driverVersion)
{
if (!driverVersion)
- return NULL;
+ return CUDA_ERROR_INVALID_VALUE;
*driverVersion = GDEV_CUDA_VERSION;
return CUDA_SUCCESS;
View
4 driver/gdev/install.sh
@@ -35,7 +35,5 @@ if [ ! -d $gdevinc ]; then
mkdir $gdevinc
fi
cp -f Module.symvers $gdevdir
-cp -f gdev_drv.h $gdevinc
-cp -f gdev_api.h $gdevinc
-cp -f gdev_nvidia_def.h $gdevinc
+cp -f {gdev_api.h,gdev_drv.h,gdev_nvidia_def.h,gdev_list.h,gdev_time.h} $gdevinc
View
14 runtime/kernel/Makefile
@@ -1,10 +1,10 @@
# Makefile
-CC = gcc
-TARGET = libgdev
-CFLAGS = -O3 -Wall
-GDEVDIR = /usr/local/gdev
-HEADERS = {gdev_api.h,gdev_lib.h,gdev_nvidia_def.h}
+CC = gcc
+TARGET = libgdev
+CFLAGS = -O3 -Wall
+GDEVDIR = /usr/local/gdev
+HEADERS = {gdev_api.h,gdev_lib.h,gdev_nvidia_def.h,gdev_list.h,gdev_time.h}
OBJS = $(patsubst %.c,%.o,$(wildcard ./*.c))
ZOMBIE = $(wildcard ./*~)
@@ -21,11 +21,11 @@ install:
@if test ! -d $(GDEVDIR)/include ; then mkdir $(GDEVDIR)/include ; fi ;
cp -f ./$(TARGET).so.1 $(GDEVDIR)/lib64/$(TARGET).so.1
ln -sf $(GDEVDIR)/lib64/$(TARGET).so.1 $(GDEVDIR)/lib64/$(TARGET).so
- cp -f ./$(HEADERS) $(GDEVDIR)/include
+ cp -f ./{$(HEADERS)} $(GDEVDIR)/include
uninstall:
rm -f $(GDEVDIR)/lib64/$(TARGET).*
- rm -f $(GDEVDIR)/include/$(HEADERS)
+ rm -f $(GDEVDIR)/include/{$(HEADERS)}
clean:
rm -f $(TARGET).so.* $(OBJS) $(ZOMBIE)
View
14 runtime/user/gdev/Makefile
@@ -1,10 +1,10 @@
# Makefile
-CC = gcc
-TARGET = libgdev
-CFLAGS = -O3 -Wall
-GDEVDIR = /usr/local/gdev
-HEADERS = {gdev_api.h,gdev_lib.h,gdev_nvidia_def.h}
+CC = gcc
+TARGET = libgdev
+CFLAGS = -O3 -Wall
+GDEVDIR = /usr/local/gdev
+HEADERS = gdev_api.h,gdev_lib.h,gdev_nvidia_def.h,gdev_list.h,gdev_time.h
OBJS = $(patsubst %.c,%.o,$(wildcard ./*.c))
ZOMBIE = $(wildcard ./*~)
@@ -21,11 +21,11 @@ install:
@if test ! -d $(GDEVDIR)/include ; then mkdir $(GDEVDIR)/include ; fi ;
cp -f ./$(TARGET).so.1.0.0 $(GDEVDIR)/lib64/$(TARGET).so.1
ln -sf $(GDEVDIR)/lib64/$(TARGET).so.1 $(GDEVDIR)/lib64/$(TARGET).so
- cp -f ./$(HEADERS) $(GDEVDIR)/include
+ cp -f ./{$(HEADERS)} $(GDEVDIR)/include
uninstall:
rm -f $(GDEVDIR)/lib64/$(TARGET).*
- rm -f $(GDEVDIR)/include/$(HEADERS)
+ rm -f $(GDEVDIR)/include/{$(HEADERS)}
clean:
rm -f $(TARGET).so.* $(OBJS) $(ZOMBIE)
View
6 tests/common/matrixadd.c
@@ -146,7 +146,7 @@ int gdev_test_matrixadd(uint32_t *a, uint32_t *b, uint32_t *c, int n)
code_size = sizeof(kcode);
if (code_size & 0xff)
- code_size = (code_size + 0x100) & ~0xff;
+ k.code_size = (code_size + 0x100) & ~0xff;
a_size = n * n * sizeof(uint32_t);
b_size = n * n * sizeof(uint32_t);
c_size = n * n * sizeof(uint32_t);
@@ -192,7 +192,7 @@ int gdev_test_matrixadd(uint32_t *a, uint32_t *b, uint32_t *c, int n)
return -1;
if (!(c_addr = gmalloc(handle, c_size)))
return -1;
- if (!(k.code_addr = gmalloc(handle, code_size)))
+ if (!(k.code_addr = gmalloc(handle, k.code_size)))
return -1;
if (!(k.cmem[0].addr = gmalloc(handle, k.cmem[0].size)))
return -1;
@@ -231,7 +231,7 @@ int gdev_test_matrixadd(uint32_t *a, uint32_t *b, uint32_t *c, int n)
k.grid_y = n / k.block_y;
k.grid_z = 1;
- gmemcpy_to_device(handle, k.code_addr, kcode, code_size);
+ gmemcpy_to_device(handle, k.code_addr, kcode, k.code_size);
gmemcpy_to_device(handle, a_addr, a, a_size);
gmemcpy_to_device(handle, b_addr, b, b_size);

0 comments on commit 180bc32

Please sign in to comment.