Skip to content
Browse files

cuda: fixed the maximum number of constant memory segments, raising it from 16 to 32

cuda: added c{1,15,16,17}[] setup for nvcc (though it is not yet known what these segments are for)
  • Loading branch information...
1 parent 043ce5c commit dbe61d2516e666fd104435f9a8f28aa22c670b39 Shinpei Kato committed Dec 19, 2011
View
2 bench/rodinia/cuda/lud/cuda/lud.c
@@ -225,7 +225,6 @@ main ( int argc, char *argv[] )
}
if (do_verify){
- printf("Before LUD\n");
print_matrix(m, matrix_dim);
matrix_duplicate(m, &mm, matrix_dim);
@@ -301,7 +300,6 @@ main ( int argc, char *argv[] )
}
if (do_verify){
- printf("After LUD\n");
print_matrix(m, matrix_dim);
printf(">>>Verify<<<<\n");
lud_verify(mm, m, matrix_dim);
View
3 common/gdev_nvidia_def.h
@@ -34,7 +34,7 @@
/**
* static numbers for nvidia GPUs.
*/
-#define GDEV_NVIDIA_CONST_SEGMENT_MAX_COUNT 16 /* by definition? */
+#define GDEV_NVIDIA_CONST_SEGMENT_MAX_COUNT 32 /* by definition? */
/**
* query values for the device-specific information.
@@ -63,7 +63,6 @@ struct gdev_kernel {
uint32_t offset; /* offset in constant memory */
} cmem[GDEV_NVIDIA_CONST_SEGMENT_MAX_COUNT];
uint32_t cmem_count; /* constant memory count */
- uint32_t cmem_param_segment; /* constant memory segment for parameters */
uint32_t param_size; /* kernel parameter size */
uint32_t *param_buf; /* kernel parameter buffer */
uint64_t lmem_addr; /* local memory address in VAS */
View
3 cuda/driver_api/context.c
@@ -151,8 +151,9 @@ CUresult cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev)
gdev_list_add(&gdev_ctx_current->list_entry, &gdev_ctx_list);
}
- /* we will trace size of memory allocated by users. */
+ /* we will trace size of memory allocated by users and # of kernels. */
ctx->data_size = 0;
+ ctx->launch_id = 0;
gdev_ctx_current = ctx; /* set to the current context. */
*pctx = ctx;
View
7 cuda/driver_api/execution.c
@@ -56,13 +56,12 @@ CUresult cuFuncSetBlockShape(CUfunction hfunc, int x, int y, int z)
struct CUmod_st *mod = func->mod;
struct CUctx_st *ctx = mod->ctx;
struct gdev_kernel *k;
- int nr_max_threads = ctx->cuda_info.warp_size * 32;
if (!gdev_initialized)
return CUDA_ERROR_NOT_INITIALIZED;
if (!ctx || ctx != gdev_ctx_current)
return CUDA_ERROR_INVALID_CONTEXT;
- if (!func || x <= 0 || y <= 0 || z <= 0 || x * y * z > nr_max_threads)
+ if (!func || x <= 0 || y <= 0 || z <= 0)
return CUDA_ERROR_INVALID_VALUE;
k = &func->kernel;
@@ -101,7 +100,7 @@ CUresult cuFuncSetSharedSize(CUfunction hfunc, unsigned int bytes)
return CUDA_ERROR_INVALID_VALUE;
k = &func->kernel;
- k->smem_size += gdev_cuda_align_smem_size(bytes);
+ k->smem_size = gdev_cuda_align_smem_size(k->smem_size + bytes);
return CUDA_SUCCESS;
}
@@ -160,7 +159,7 @@ CUresult cuLaunchGrid(CUfunction f, int grid_width, int grid_height)
k->grid_x = grid_width;
k->grid_y = grid_height;
k->grid_z = 1;
- k->grid_id = 1;
+ k->grid_id = ++ctx->launch_id;
k->smem_base = gdev_cuda_align_base(ctx->data_size);
k->lmem_base = gdev_cuda_align_base(k->smem_base + k->smem_size);
View
16 cuda/driver_api/gdev_cuda.c
@@ -222,7 +222,6 @@ static void init_kernel(struct gdev_kernel *k)
k->cmem[i].offset = 0;
}
k->cmem_count = 0;
- k->cmem_param_segment = 0;
k->param_buf = NULL;
k->param_size = 0;
k->lmem_addr = 0;
@@ -620,15 +619,25 @@ CUresult gdev_cuda_construct_kernels
if (!(k->param_buf = MALLOC(k->param_size)))
goto fail_malloc_param;
+ /* the following c[] setup is NVIDIA's nvcc-specific. */
+ k->cmem_count = GDEV_NVIDIA_CONST_SEGMENT_MAX_COUNT;
+ /* c0[] is a parameter list. */
memcpy(k->param_buf, f->cmem[0].buf, f->param_base);
k->cmem[0].size = gdev_cuda_align_cmem_size(f->param_size);
k->cmem[0].offset = 0;
for (i = 1; i < GDEV_NVIDIA_CONST_SEGMENT_MAX_COUNT; i++) {
k->cmem[i].size = gdev_cuda_align_cmem_size(f->cmem[i].size);
k->cmem[i].offset = 0; /* no usage. */
}
- k->cmem_count = GDEV_NVIDIA_CONST_SEGMENT_MAX_COUNT;
- k->cmem_param_segment = 0; /* c0[] is used for parameters in nvcc. */
+ /* c{1,15,16,17}[] are something unknown... */
+ k->cmem[1].size = 0x10000;
+ k->cmem[1].offset = 0;
+ k->cmem[15].size = 0x10000;
+ k->cmem[15].offset = 0;
+ k->cmem[16].size = k->cmem[0].size;
+ k->cmem[16].offset = 0;
+ k->cmem[17].size = k->cmem[0].size;
+ k->cmem[17].offset = 0;
/* FIXME: what is the right local memory size?
the blob trace says lmem_size > 0xf0 and lmem_size_neg > 0x7fc.
@@ -646,7 +655,6 @@ CUresult gdev_cuda_construct_kernels
/* stack level needs rounded up? */
if (stack_depth % warp_count != 0)
k->stack_level++;
- k->stack_level = 8;
/* FIXME: what is the right stack size? */
stack_size = k->stack_level * 0x10;
View
2 cuda/driver_api/gdev_cuda.h
@@ -29,7 +29,6 @@
#define GDEV_CUDA_VERSION 4000
#define GDEV_CUDA_USER_PARAM_BASE 0x20
-#define GDEV_CUDA_CMEM_SEGMENT_COUNT 16 /* by definition? */
#ifndef NULL
#define NULL 0
@@ -79,6 +78,7 @@ struct CUctx_st {
struct gdev_list list_entry;
struct gdev_cuda_info cuda_info;
uint64_t data_size;
+ int launch_id;
};
struct CUmod_st {

0 comments on commit dbe61d2

Please sign in to comment.
Something went wrong with that request. Please try again.