Permalink
Browse files

gdev: added asynchronous memcpy (not tested well)

gdev: added gref() and gunref() API
cuda: added Stream Management
  • Loading branch information...
Shinpei Kato
Shinpei Kato committed Mar 16, 2012
1 parent 2046147 commit 3820d7634349c8d7333bb627cd2ab04d574a935d
View
@@ -1131,6 +1131,44 @@ int gshmctl(Ghandle h, int id, int cmd, void *buf)
return ret;
}
+/**
+ * gref():
+ * reference the device virtual memory at @addr of handle @hmaster from
+ * handle @hslave, by attaching it to @hslave's vas with the given @size.
+ * returns the address obtained for the new reference, or 0 if @addr is not
+ * found as device memory under @hmaster or if the attach fails.
+ * this API can be used alone - no need to call other gshm* APIs a priori.
+ */
+uint64_t gref(Ghandle hmaster, uint64_t addr, uint64_t size, Ghandle hslave)
+{
+ gdev_mem_t *mem, *new;
+
+ /* find the memory object registered for @addr under the master handle. */
+ mem = gdev_mem_lookup(hmaster->vas, addr, GDEV_MEM_DEVICE);
+ if (!mem)
+ return 0;
+
+ /* attach that object to the slave's vas; @size is passed through as is. */
+ new = gdev_shm_attach(hslave->vas, mem, size);
+ if (!new)
+ return 0;
+
+ /* the caller gets the address of the attached object, not @addr. */
+ return gdev_mem_getaddr(new);
+}
+
+/**
+ * gunref():
+ * unreference the device virtual memory at @addr of handle @h from the
+ * shared region.
+ * returns 0 on success, or -ENOENT if @addr is not found as device memory
+ * under @h.
+ * NOTE(review): @addr is presumably an address previously returned by
+ * gref() for the same handle - confirm against callers.
+ */
+int gunref(Ghandle h, uint64_t addr)
+{
+ gdev_vas_t *vas = h->vas;
+ gdev_mem_t *mem;
+
+ /* find the memory object registered for @addr under this handle. */
+ mem = gdev_mem_lookup(vas, addr, GDEV_MEM_DEVICE);
+ if (!mem)
+ return -ENOENT;
+
+ /* any result of the detach is not propagated; 0 is returned regardless. */
+ gdev_shm_detach(mem);
+
+ return 0;
+}
+
/**
* gphysget():
* get the physical (PCI) bus address associated with buffer pointer @p
View
@@ -68,6 +68,8 @@ int gshmget(Ghandle h, int key, uint64_t size, int flags);
uint64_t gshmat(Ghandle h, int id, uint64_t addr, int flags);
int gshmdt(Ghandle h, uint64_t addr);
int gshmctl(Ghandle h, int id, int cmd, void *buf);
+uint64_t gref(Ghandle hmaster, uint64_t addr, uint64_t size, Ghandle hslave);
+int gunref(Ghandle h, uint64_t addr);
uint64_t gphysget(Ghandle h, void *p);
uint64_t gvirtget(Ghandle h, void *p);
View
@@ -30,7 +30,12 @@
#define __GDEV_IOCTL_DEF_H__
/**
- * user-space ioctl commands:
+ * utility ioctl commands:
+ */
+#define GDEV_IOCTL_GET_HANDLE 0x10
+
+/**
+ * user-space ioctl commands for Gdev API:
*/
#define GDEV_IOCTL_GMALLOC 0x100
#define GDEV_IOCTL_GFREE 0x101
@@ -52,8 +57,14 @@
#define GDEV_IOCTL_GSHMAT 0x117
#define GDEV_IOCTL_GSHMDT 0x118
#define GDEV_IOCTL_GSHMCTL 0x119
-#define GDEV_IOCTL_GPHYSGET 0x120
-#define GDEV_IOCTL_GVIRTGET 0x121
+#define GDEV_IOCTL_GREF 0x120
+#define GDEV_IOCTL_GUNREF 0x121
+#define GDEV_IOCTL_GPHYSGET 0x122
+#define GDEV_IOCTL_GVIRTGET 0x123
+
+/* NOTE(review): presumably the argument of GDEV_IOCTL_GET_HANDLE - confirm. */
+struct gdev_ioctl_handle {
+ uint64_t handle;
+};
struct gdev_ioctl_mem {
uint64_t addr;
@@ -105,6 +116,17 @@ struct gdev_ioctl_map {
uint64_t size;
};
+/*
+ * NOTE(review): presumably the argument of GDEV_IOCTL_GREF, mirroring
+ * gref(hmaster, addr, size, hslave) - confirm against the ioctl handler.
+ */
+struct gdev_ioctl_ref {
+ uint64_t addr; /* presumably gref()'s @addr - confirm. */
+ uint64_t size; /* presumably gref()'s @size - confirm. */
+ uint64_t handle_slave; /* presumably gref()'s @hslave - confirm. */
+ uint64_t addr_slave; /* presumably the address gref() returns - confirm. */
+};
+
+/*
+ * NOTE(review): presumably the argument of GDEV_IOCTL_GUNREF, mirroring
+ * gunref(h, addr) - confirm against the ioctl handler.
+ */
+struct gdev_ioctl_unref {
+ uint64_t addr; /* presumably gunref()'s @addr - confirm. */
+};
+
struct gdev_ioctl_phys {
uint64_t addr;
uint64_t phys;
View
@@ -97,7 +97,7 @@ CUresult cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev)
struct CUctx_st *ctx;
struct gdev_cuda_info *cuda_info;
Ghandle handle;
- int minor = dev;
+ int minor = (int)dev;
if (!gdev_initialized)
return CUDA_ERROR_NOT_INITIALIZED;
@@ -156,6 +156,8 @@ CUresult cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev)
/* we will trace # of kernels. */
ctx->launch_id = 0;
+ /* save the device ID. */
+ ctx->minor = minor;
gdev_ctx_current = ctx; /* set to the current context. */
*pctx = ctx;
@@ -313,7 +315,7 @@ CUresult cuCtxPopCurrent(CUcontext *pctx)
CUresult cuCtxSynchronize(void)
{
Ghandle handle;
- struct gdev_cuda_launch *l;
+ struct gdev_cuda_fence *f;
struct gdev_list *p;
if (!gdev_initialized)
@@ -327,17 +329,17 @@ CUresult cuCtxSynchronize(void)
handle = gdev_ctx_current->gdev_handle;
/* synchronize with all kernels. */
- gdev_list_for_each(l, &gdev_ctx_current->sync_list, list_entry) {
+ gdev_list_for_each(f, &gdev_ctx_current->sync_list, list_entry) {
/* if timeout is required, specify gdev_time value instead of NULL. */
- if (gsync(handle, l->id, NULL))
+ if (gsync(handle, f->id, NULL))
return CUDA_ERROR_UNKNOWN;
}
/* remove all lists. */
while ((p = gdev_list_head(&gdev_ctx_current->sync_list))) {
gdev_list_del(p);
- l = gdev_list_container(p);
- FREE(l);
+ f = gdev_list_container(p);
+ FREE(f);
}
if (gbarrier(handle))
View
@@ -791,6 +791,13 @@ CUresult cuMemUnmap(void *buf);
/* Memory mapped address - Gdev extension */
CUresult cuMemGetPhysAddr(unsigned long long *addr, void *p);
+/* Stream Management */
+CUresult cuStreamCreate(CUstream *phStream, unsigned int Flags);
+CUresult cuStreamDestroy(CUstream hStream);
+CUresult cuStreamQuery(CUstream hStream);
+CUresult cuStreamSynchronize(CUstream hStream);
+CUresult cuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags);
+
/* Inter-Process Communication (IPC) - Gdev extension */
CUresult cuShmGet(int *ptr, int key, size_t size, int flags);
CUresult cuShmAt(CUdeviceptr *dptr, int id, int flags);
@@ -146,7 +146,7 @@ CUresult cuLaunchGrid(CUfunction f, int grid_width, int grid_height)
struct CUmod_st *mod = func->mod;
struct CUctx_st *ctx = mod->ctx;
struct gdev_kernel *k;
- struct gdev_cuda_launch *l;
+ struct gdev_cuda_fence *fence;
Ghandle handle;
if (!gdev_initialized)
@@ -155,7 +155,7 @@ CUresult cuLaunchGrid(CUfunction f, int grid_width, int grid_height)
return CUDA_ERROR_INVALID_CONTEXT;
if (!func || grid_width <= 0 || grid_height <= 0)
return CUDA_ERROR_INVALID_VALUE;
- if (!(l = MALLOC(sizeof(*l))))
+ if (!(fence = (struct gdev_cuda_fence *)MALLOC(sizeof(*fence))))
return CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES;
k = &func->kernel;
@@ -169,10 +169,11 @@ CUresult cuLaunchGrid(CUfunction f, int grid_width, int grid_height)
handle = gdev_ctx_current->gdev_handle;
- if (glaunch(handle, k, &l->id))
+ if (glaunch(handle, k, &fence->id))
return CUDA_ERROR_LAUNCH_FAILED;
- gdev_list_init(&l->list_entry, l);
- gdev_list_add(&l->list_entry, &ctx->sync_list);
+ fence->addr_ref = 0; /* no address to unreference later. */
+ gdev_list_init(&fence->list_entry, fence);
+ gdev_list_add(&fence->list_entry, &ctx->sync_list);
return CUDA_SUCCESS;
}
@@ -68,8 +68,9 @@ struct gdev_cuda_raw_func {
uint32_t local_size_neg;
};
-struct gdev_cuda_launch {
- uint32_t id; /* kernel ID returned by the launch function. */
+/*
+ * one entry of a synchronization list: queued by a kernel launch (with
+ * addr_ref set to 0) or by an asynchronous memcpy (with addr_ref set to the
+ * address returned by gref()).
+ */
+struct gdev_cuda_fence {
+ uint32_t id; /* fence ID returned by the Gdev API. */
+ uint64_t addr_ref; /* only used for asynchronous memcpy. */
struct gdev_list list_entry; /* entry to synchronization list. */
};
@@ -87,6 +88,7 @@ struct CUctx_st {
struct gdev_list sync_list;
struct gdev_cuda_info cuda_info;
int launch_id;
+ int minor;
};
struct CUmod_st {
@@ -126,6 +128,9 @@ struct CUevent_st {
};
+/* per-stream state used by the asynchronous memcpy functions. */
struct CUstream_st {
+ Ghandle gdev_handle; /* Gdev handle that asynchronous memcpy is issued on. */
+ struct CUctx_st *ctx; /* must match the current context for async memcpy. */
+ struct gdev_list sync_list; /* for gdev_cuda_fence.list_entry */
};
struct CUgraphicsResource_st {
View
@@ -201,8 +201,7 @@ CUresult cuMemFreeHost(void *p)
* CUDA_SUCCESS, CUDA_ERROR_DEINITIALIZED, CUDA_ERROR_NOT_INITIALIZED,
* CUDA_ERROR_INVALID_CONTEXT, CUDA_ERROR_INVALID_VALUE
*/
-CUresult cuMemcpyHtoD
-(CUdeviceptr dstDevice, const void *srcHost, unsigned int ByteCount)
+CUresult cuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, unsigned int ByteCount)
{
Ghandle handle;
const void *src_buf = srcHost;
@@ -244,12 +243,53 @@ CUresult cuMemcpyHtoD
* CUDA_SUCCESS, CUDA_ERROR_DEINITIALIZED, CUDA_ERROR_NOT_INITIALIZED,
* CUDA_ERROR_INVALID_CONTEXT, CUDA_ERROR_INVALID_VALUE
*/
-CUresult cuMemcpyHtoDAsync
-(CUdeviceptr dstDevice, const void *srcHost, unsigned int ByteCount,
- CUstream hStream)
+CUresult cuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost, unsigned int ByteCount, CUstream hStream)
{
- GDEV_PRINT("cuMemcpyHtoD: Not Implemented Yet\n");
+ Ghandle handle, handle_ref;
+ struct CUstream_st *stream = hStream;
+ const void *src_buf = srcHost;
+ uint64_t dst_addr = dstDevice;
+ uint64_t dst_addr_ref;
+ uint32_t size = ByteCount;
+ struct gdev_cuda_fence *fence;
+ uint32_t id;
+
+ /* a NULL stream falls back to the synchronous copy. */
+ if (!stream)
+ return cuMemcpyHtoD(dst_addr, src_buf, size);
+
+ if (!gdev_initialized)
+ return CUDA_ERROR_NOT_INITIALIZED;
+ if (!src_buf || !dst_addr || !size)
+ return CUDA_ERROR_INVALID_VALUE;
+ /* the stream's context must be the current one. */
+ if (gdev_ctx_current != stream->ctx)
+ return CUDA_ERROR_INVALID_CONTEXT;
+
+ /* allocate the fence up front; every failure path below frees it. */
+ fence = (struct gdev_cuda_fence *)MALLOC(sizeof(*fence));
+ if (!fence)
+ return CUDA_ERROR_OUT_OF_MEMORY; /* this API shouldn't return it... */
+
+ /* the copy is issued on the stream's Gdev handle, not the context's. */
+ handle = gdev_ctx_current->gdev_handle;
+ handle_ref = stream->gdev_handle;
+
+ /*
+ * reference the destination of the context's handle from the stream's
+ * handle; the copy below uses the referenced address.
+ */
+ if (!(dst_addr_ref = gref(handle, dst_addr, size, handle_ref)))
+ goto fail_gref;
+
+ if (gmemcpy_to_device_async(handle_ref, dst_addr_ref, src_buf, size, &id))
+ goto fail_gmemcpy;
+
+ /*
+ * queue the fence ID and the referenced address on the stream's list.
+ * NOTE(review): presumably both are consumed (gsync/gunref) when the
+ * stream is synchronized - confirm in the stream implementation.
+ */
+ fence->id = id;
+ fence->addr_ref = dst_addr_ref;
+ gdev_list_init(&fence->list_entry, fence);
+ gdev_list_add(&fence->list_entry, &stream->sync_list);
+
return CUDA_SUCCESS;
+
+fail_gmemcpy:
+ /* drop the reference taken by gref() above. */
+ gunref(handle_ref, dst_addr_ref);
+fail_gref:
+ FREE(fence);
+
+ return CUDA_ERROR_UNKNOWN;
}
/**
@@ -266,8 +306,7 @@ CUresult cuMemcpyHtoDAsync
* CUDA_SUCCESS, CUDA_ERROR_DEINITIALIZED, CUDA_ERROR_NOT_INITIALIZED,
* CUDA_ERROR_INVALID_CONTEXT, CUDA_ERROR_INVALID_VALUE
*/
-CUresult cuMemcpyDtoH
-(void *dstHost, CUdeviceptr srcDevice, unsigned int ByteCount)
+CUresult cuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, unsigned int ByteCount)
{
Ghandle handle;
void *dst_buf = dstHost;
@@ -309,11 +348,53 @@ CUresult cuMemcpyDtoH
* CUDA_SUCCESS, CUDA_ERROR_DEINITIALIZED, CUDA_ERROR_NOT_INITIALIZED,
* CUDA_ERROR_INVALID_CONTEXT, CUDA_ERROR_INVALID_VALUE
*/
-CUresult cuMemcpyDtoHAsync
-(void *dstHost, CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hstream)
+CUresult cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream)
{
- GDEV_PRINT("cuMemcpyDtoH: Not Implemented Yet\n");
+ Ghandle handle, handle_ref;
+ struct CUstream_st *stream = hStream;
+ void *dst_buf = dstHost;
+ uint64_t src_addr = srcDevice;
+ uint64_t src_addr_ref;
+ uint32_t size = ByteCount;
+ struct gdev_cuda_fence *fence;
+ uint32_t id;
+
+ /* a NULL stream falls back to the synchronous copy. */
+ if (!stream)
+ return cuMemcpyDtoH(dst_buf, src_addr, size);
+
+ if (!gdev_initialized)
+ return CUDA_ERROR_NOT_INITIALIZED;
+ if (!dst_buf || !src_addr || !size)
+ return CUDA_ERROR_INVALID_VALUE;
+ /* the stream's context must be the current one. */
+ if (gdev_ctx_current != stream->ctx)
+ return CUDA_ERROR_INVALID_CONTEXT;
+
+ /* allocate the fence up front; every failure path below frees it. */
+ fence = (struct gdev_cuda_fence *)MALLOC(sizeof(*fence));
+ if (!fence)
+ return CUDA_ERROR_OUT_OF_MEMORY; /* this API shouldn't return it... */
+
+ /* the copy is issued on the stream's Gdev handle, not the context's. */
+ handle = gdev_ctx_current->gdev_handle;
+ handle_ref = stream->gdev_handle;
+
+ /*
+ * reference the source of the context's handle from the stream's handle;
+ * the copy below uses the referenced address.
+ */
+ if (!(src_addr_ref = gref(handle, src_addr, size, handle_ref)))
+ goto fail_gref;
+
+ if (gmemcpy_from_device_async(handle_ref, dst_buf, src_addr_ref, size, &id))
+ goto fail_gmemcpy;
+
+ /*
+ * queue the fence ID and the referenced address on the stream's list.
+ * NOTE(review): presumably both are consumed (gsync/gunref) when the
+ * stream is synchronized - confirm in the stream implementation.
+ */
+ fence->id = id;
+ fence->addr_ref = src_addr_ref;
+ gdev_list_init(&fence->list_entry, fence);
+ gdev_list_add(&fence->list_entry, &stream->sync_list);
+
return CUDA_SUCCESS;
+
+fail_gmemcpy:
+ /* drop the reference taken by gref() above. */
+ gunref(handle_ref, src_addr_ref);
+fail_gref:
+ FREE(fence);
+
+ return CUDA_ERROR_UNKNOWN;
}
/**
Oops, something went wrong.

0 comments on commit 3820d76

Please sign in to comment.