Permalink
Browse files

gdev: fixed gref(), gunref()

gdev: name changes: gmemcpy_in_device() -> gmemcpy
gdev: added gmemcpy_async()
cuda: fixed cuMemcpyHtoDAsync() and cuMemcpyDtoHAsync()
  • Loading branch information...
1 parent 88e6193 commit 9fd7f4bf921e41b2ce6dacb7956e849e2ce0ab3d Shinpei Kato committed Mar 20, 2012
View
205 common/gdev_api.c
@@ -218,11 +218,17 @@ static int __gmemcpy_to_device_locked(gdev_ctx_t *ctx, uint64_t dst_addr, const
if (size <= 4) {
gdev_write32(mem, dst_addr, ((uint32_t*)src_buf)[0]);
ret = 0;
+ /* if @id is given while not asynchronous, give it zero. */
+ if (id)
+ *id = 0;
}
else if (size <= GDEV_MEMCPY_IORW_LIMIT) {
ret = gdev_write(mem, dst_addr, src_buf, size);
+ /* if @id is given while not asynchronous, give it zero. */
+ if (id)
+ *id = 0;
}
- else if ((hmem = gdev_mem_lookup(vas, (uint64_t)src_buf, GDEV_MEM_DMA))) {
+ else if ((hmem = gdev_mem_lookup_by_buf(vas, src_buf, GDEV_MEM_DMA))) {
ret = __gmemcpy_dma_to_device(ctx, dst_addr, hmem->addr, size, id);
}
else {
@@ -244,6 +250,10 @@ static int __gmemcpy_to_device_locked(gdev_ctx_t *ctx, uint64_t dst_addr, const
/* free bounce buffer memory, if necessary. */
if (!dma_mem)
__free_dma(bmem, p_count);
+
+ /* if @id is given while not asynchronous, give it zero. */
+ if (id)
+ *id = 0;
}
return ret;
@@ -260,12 +270,13 @@ static int __gmemcpy_to_device(struct gdev_handle *h, uint64_t dst_addr, const v
#endif
gdev_vas_t *vas = h->vas;
gdev_ctx_t *ctx = h->ctx;
- gdev_mem_t *mem = gdev_mem_lookup(vas, dst_addr, GDEV_MEM_DEVICE);
gdev_mem_t **dma_mem = h->dma_mem;
+ gdev_mem_t *mem;
uint32_t ch_size = h->chunk_size;
int p_count = h->pipeline_count;
int ret;
+ mem = gdev_mem_lookup_by_addr(vas, dst_addr, GDEV_MEM_DEVICE);
if (!mem)
return -ENOENT;
@@ -277,9 +288,8 @@ static int __gmemcpy_to_device(struct gdev_handle *h, uint64_t dst_addr, const v
gdev_mem_lock(mem);
gdev_shm_evict_conflict(ctx, mem); /* evict conflicting data. */
- ret = __gmemcpy_to_device_locked(ctx, dst_addr, src_buf, size, id,
- ch_size, p_count, vas, mem, dma_mem,
- host_copy);
+ ret = __gmemcpy_to_device_locked(ctx, dst_addr, src_buf, size, id, ch_size, p_count, vas, mem, dma_mem, host_copy);
+
gdev_mem_unlock(mem);
#ifndef GDEV_SCHED_DISABLED
@@ -419,11 +429,17 @@ static int __gmemcpy_from_device_locked(gdev_ctx_t *ctx, void *dst_buf, uint64_t
if (size <= 4) {
((uint32_t*)dst_buf)[0] = gdev_read32(mem, src_addr);
ret = 0;
+ /* if @id is given while not asynchronous, give it zero. */
+ if (id)
+ *id = 0;
}
else if (size <= GDEV_MEMCPY_IORW_LIMIT) {
ret = gdev_read(mem, dst_buf, src_addr, size);
+ /* if @id is given while not asynchronous, give it zero. */
+ if (id)
+ *id = 0;
}
- else if ((hmem = gdev_mem_lookup(vas, (uint64_t)dst_buf, GDEV_MEM_DMA))) {
+ else if ((hmem = gdev_mem_lookup_by_buf(vas, dst_buf, GDEV_MEM_DMA))) {
ret = __gmemcpy_dma_from_device(ctx, hmem->addr, src_addr, size, id);
}
else {
@@ -444,6 +460,10 @@ static int __gmemcpy_from_device_locked(gdev_ctx_t *ctx, void *dst_buf, uint64_t
/* free bounce buffer memory, if necessary. */
if (!dma_mem)
__free_dma(bmem, p_count);
+
+ /* if @id is given while not asynchronous, give it zero. */
+ if (id)
+ *id = 0;
}
return ret;
@@ -460,12 +480,13 @@ static int __gmemcpy_from_device(struct gdev_handle *h, void *dst_buf, uint64_t
#endif
gdev_vas_t *vas = h->vas;
gdev_ctx_t *ctx = h->ctx;
- gdev_mem_t *mem = gdev_mem_lookup(vas, src_addr, GDEV_MEM_DEVICE);
gdev_mem_t **dma_mem = h->dma_mem;
+ gdev_mem_t *mem;
uint32_t ch_size = h->chunk_size;
int p_count = h->pipeline_count;
int ret;
+ mem = gdev_mem_lookup_by_addr(vas, src_addr, GDEV_MEM_DEVICE);
if (!mem)
return -ENOENT;
@@ -498,11 +519,15 @@ int gdev_callback_save_to_host(void *h, void* dst_buf, uint64_t src_addr, uint64
{
gdev_vas_t *vas = ((struct gdev_handle*)h)->vas;
gdev_ctx_t *ctx = ((struct gdev_handle*)h)->ctx;
- gdev_mem_t *mem = gdev_mem_lookup(vas, src_addr, GDEV_MEM_DEVICE);
gdev_mem_t **dma_mem = ((struct gdev_handle*)h)->dma_mem;
+ gdev_mem_t *mem;
uint32_t ch_size = ((struct gdev_handle*)h)->chunk_size;
int p_count = ((struct gdev_handle*)h)->pipeline_count;
+ mem = gdev_mem_lookup_by_addr(vas, src_addr, GDEV_MEM_DEVICE);
+ if (!mem)
+ return -ENOENT;
+
return __gmemcpy_from_device_locked(ctx, dst_buf, src_addr, size, NULL, ch_size, p_count, vas, mem, dma_mem, __f_memcpy);
}
@@ -527,11 +552,15 @@ int gdev_callback_load_from_host(void *h, uint64_t dst_addr, void *src_buf, uint
{
gdev_vas_t *vas = ((struct gdev_handle*)h)->vas;
gdev_ctx_t *ctx = ((struct gdev_handle*)h)->ctx;
- gdev_mem_t *mem = gdev_mem_lookup(vas, dst_addr, GDEV_MEM_DEVICE);
gdev_mem_t **dma_mem = ((struct gdev_handle*)h)->dma_mem;
+ gdev_mem_t *mem;
uint32_t ch_size = ((struct gdev_handle*)h)->chunk_size;
int p_count = ((struct gdev_handle*)h)->pipeline_count;
+ mem = gdev_mem_lookup_by_addr(vas, dst_addr, GDEV_MEM_DEVICE);
+ if (!mem)
+ return -ENOENT;
+
return __gmemcpy_to_device_locked(ctx, dst_addr, src_buf, size, NULL, ch_size, p_count, vas, mem, dma_mem, __f_memcpy);
}
@@ -729,7 +758,7 @@ uint64_t gfree(struct gdev_handle *h, uint64_t addr)
gdev_mem_t *mem;
uint64_t size;
- if (!(mem = gdev_mem_lookup(vas, addr, GDEV_MEM_DEVICE)))
+ if (!(mem = gdev_mem_lookup_by_addr(vas, addr, GDEV_MEM_DEVICE)))
goto fail;
size = gdev_mem_getsize(mem);
gdev_mem_free(mem);
@@ -777,7 +806,7 @@ uint64_t gfree_dma(struct gdev_handle *h, void *buf)
gdev_mem_t *mem;
uint64_t size;
- if (!(mem = gdev_mem_lookup(vas, (uint64_t)buf, GDEV_MEM_DMA)))
+ if (!(mem = gdev_mem_lookup_by_buf(vas, buf, GDEV_MEM_DMA)))
goto fail;
size = gdev_mem_getsize(mem);
gdev_mem_free(mem);
@@ -800,7 +829,7 @@ void *gmap(struct gdev_handle *h, uint64_t addr, uint64_t size)
gdev_mem_t *mem;
uint64_t offset;
- if (!(mem = gdev_mem_lookup(vas, addr, GDEV_MEM_DEVICE)))
+ if (!(mem = gdev_mem_lookup_by_addr(vas, addr, GDEV_MEM_DEVICE)))
goto fail;
offset = addr - gdev_mem_getaddr(mem);
@@ -820,7 +849,7 @@ int gunmap(struct gdev_handle *h, void *buf)
gdev_mem_t *mem;
uint32_t type = GDEV_MEM_DEVICE | GDEV_MEM_DMA;
- if (!(mem = gdev_mem_lookup(vas, (uint64_t)buf, type)))
+ if (!(mem = gdev_mem_lookup_by_buf(vas, buf, type)))
goto fail;
gdev_mem_unmap(mem);
@@ -905,24 +934,35 @@ int gmemcpy_user_from_device_async(struct gdev_handle *h, void *dst_buf, uint64_
}
/**
- * gmemcpy_in_device():
- * copy data of the given size within the device memory.
+ * gmemcpy():
+ * copy data of the given size within the global address space.
+ * this could be HtoD, DtoH, DtoD, and HtoH.
*/
-int gmemcpy_in_device
-(struct gdev_handle *h, uint64_t dst_addr, uint64_t src_addr, uint64_t size)
+int gmemcpy(struct gdev_handle *h, uint64_t dst_addr, uint64_t src_addr, uint64_t size)
{
#ifndef GDEV_SCHED_DISABLED
struct gdev_sched_entity *se = h->se;
struct gdev_device *gdev = h->gdev;
#endif
gdev_ctx_t *ctx = h->ctx;
gdev_vas_t *vas = h->vas;
- gdev_mem_t *dst = gdev_mem_lookup(vas, dst_addr, GDEV_MEM_DEVICE);
- gdev_mem_t *src = gdev_mem_lookup(vas, src_addr, GDEV_MEM_DEVICE);
+ gdev_mem_t *dst;
+ gdev_mem_t *src;
uint32_t fence;
- if (!dst || !src)
- return -ENOENT;
+ dst = gdev_mem_lookup_by_addr(vas, dst_addr, GDEV_MEM_DEVICE);
+ if (!dst) {
+ dst = gdev_mem_lookup_by_addr(vas, dst_addr, GDEV_MEM_DMA);
+ if (!dst)
+ return -ENOENT;
+ }
+
+ src = gdev_mem_lookup_by_addr(vas, src_addr, GDEV_MEM_DEVICE);
+ if (!src) {
+ src = gdev_mem_lookup_by_addr(vas, src_addr, GDEV_MEM_DMA);
+ if (!src)
+ return -ENOENT;
+ }
#ifndef GDEV_SCHED_DISABLED
/* decide if the context needs to stall or not. */
@@ -947,6 +987,60 @@ int gmemcpy_in_device
}
/**
+ * gmemcpy_async():
+ * asynchronously copy data of the given size within the global address space.
+ * this could be HtoD, DtoH, DtoD, and HtoH.
+ */
+int gmemcpy_async(struct gdev_handle *h, uint64_t dst_addr, uint64_t src_addr, uint64_t size, uint32_t *id)
+{
+#ifndef GDEV_SCHED_DISABLED
+ struct gdev_sched_entity *se = h->se;
+ struct gdev_device *gdev = h->gdev;
+#endif
+ gdev_ctx_t *ctx = h->ctx;
+ gdev_vas_t *vas = h->vas;
+ gdev_mem_t *dst;
+ gdev_mem_t *src;
+ uint32_t fence;
+
+ dst = gdev_mem_lookup_by_addr(vas, dst_addr, GDEV_MEM_DEVICE);
+ if (!dst) {
+ dst = gdev_mem_lookup_by_addr(vas, dst_addr, GDEV_MEM_DMA);
+ if (!dst)
+ return -ENOENT;
+ }
+
+ src = gdev_mem_lookup_by_addr(vas, src_addr, GDEV_MEM_DEVICE);
+ if (!src) {
+ src = gdev_mem_lookup_by_addr(vas, src_addr, GDEV_MEM_DMA);
+ if (!src)
+ return -ENOENT;
+ }
+
+#ifndef GDEV_SCHED_DISABLED
+ /* decide if the context needs to stall or not. */
+ gdev_schedule_memory(se);
+#endif
+
+ gdev_mem_lock(dst);
+ gdev_mem_lock(src);
+
+ fence = gdev_memcpy_async(ctx, dst_addr, src_addr, size);
+
+ gdev_mem_unlock(src);
+ gdev_mem_unlock(dst);
+
+#ifndef GDEV_SCHED_DISABLED
+ /* this should be done upon interrupt. */
+ gdev_select_next_memory(gdev);
+#endif
+
+ *id = fence;
+
+ return 0;
+}
+
+/**
* glaunch():
* launch the GPU kernel code.
*/
@@ -980,6 +1074,9 @@ int glaunch(struct gdev_handle *h, struct gdev_kernel *kernel, uint32_t *id)
*/
int gsync(struct gdev_handle *h, uint32_t id, struct gdev_time *timeout)
{
+ /* @id could be zero if users have called memcpy_async in the wrong way. */
+ if (id == 0)
+ return 0;
return gdev_poll(h->ctx, id, timeout);
}
@@ -1052,6 +1149,9 @@ int gshmget(Ghandle h, int key, uint64_t size, int flags)
gdev_vas_t *vas = h->vas;
int id;
+ if (key == 0 || size == 0)
+ return -EINVAL;
+
gdev_mutex_lock(&gdev->shm_mutex);
id = gdev_shm_create(gdev, vas, key, size, flags);
gdev_mutex_unlock(&gdev->shm_mutex);
@@ -1095,7 +1195,7 @@ int gshmdt(Ghandle h, uint64_t addr)
gdev_mem_t *mem;
gdev_mutex_lock(&gdev->shm_mutex);
- if (!(mem = gdev_mem_lookup(vas, addr, GDEV_MEM_DEVICE)))
+ if (!(mem = gdev_mem_lookup_by_addr(vas, addr, GDEV_MEM_DEVICE)))
goto fail;
gdev_shm_detach(mem);
gdev_mutex_unlock(&gdev->shm_mutex);
@@ -1141,16 +1241,19 @@ int gshmctl(Ghandle h, int id, int cmd, void *buf)
/**
* gref():
- * reference device virtual memory of handle @hsrc from handle @hdst.
- * this API can be used alone - no need to call other gshm* APIs a priori.
+ * reference virtual memory from handle @hsrc to handle @hdst.
*/
uint64_t gref(Ghandle hmaster, uint64_t addr, uint64_t size, Ghandle hslave)
{
gdev_mem_t *mem, *new;
- mem = gdev_mem_lookup(hmaster->vas, addr, GDEV_MEM_DEVICE);
- if (!mem)
- return 0;
+ mem = gdev_mem_lookup_by_addr(hmaster->vas, addr, GDEV_MEM_DEVICE);
+ if (!mem) {
+ /* try to find a host DMA memory object. */
+ mem = gdev_mem_lookup_by_addr(hmaster->vas, addr, GDEV_MEM_DMA);
+ if (!mem)
+ return 0;
+ }
new = gdev_shm_attach(hslave->vas, mem, size);
if (!new)
@@ -1161,16 +1264,20 @@ uint64_t gref(Ghandle hmaster, uint64_t addr, uint64_t size, Ghandle hslave)
/**
* gunref():
- * unreference device virtual memory from the shared region.
+ * unreference virtual memory from the shared region.
*/
int gunref(Ghandle h, uint64_t addr)
{
gdev_vas_t *vas = h->vas;
gdev_mem_t *mem;
- mem = gdev_mem_lookup(vas, addr, GDEV_MEM_DEVICE);
- if (!mem)
- return -ENOENT;
+ mem = gdev_mem_lookup_by_addr(vas, addr, GDEV_MEM_DEVICE);
+ if (!mem) {
+ /* try to find a host DMA memory object. */
+ mem = gdev_mem_lookup_by_addr(vas, addr, GDEV_MEM_DMA);
+ if (!mem)
+ return -ENOENT;
+ }
gdev_shm_detach(mem);
@@ -1181,25 +1288,21 @@ int gunref(Ghandle h, uint64_t addr)
* gphysget():
* get the physical (PCI) bus address associated with buffer pointer @p
*/
-uint64_t gphysget(Ghandle h, void *p)
+uint64_t gphysget(Ghandle h, const void *p)
{
gdev_vas_t *vas = h->vas;
gdev_mem_t *mem;
- uint32_t type = GDEV_MEM_DMA;
uint64_t offset;
- mem = gdev_mem_lookup(vas, (uint64_t)p, type);
- if (mem)
- offset = (uint64_t)p - (uint64_t)gdev_mem_getbuf(mem);
- else {
- type |= GDEV_MEM_DEVICE;
- mem = gdev_mem_lookup(vas, (uint64_t)p, type);
- if (mem)
- offset = (uint64_t)p - (uint64_t)gdev_mem_getbuf(mem);
- else
+ mem = gdev_mem_lookup_by_buf(vas, p, GDEV_MEM_DEVICE);
+ if (!mem) {
+ mem = gdev_mem_lookup_by_buf(vas, p, GDEV_MEM_DMA);
+ if (!mem)
goto fail;
}
+ offset = (uint64_t)p - (uint64_t)gdev_mem_getbuf(mem);
+
return gdev_mem_phys_getaddr(mem, offset);
fail:
@@ -1210,25 +1313,21 @@ uint64_t gphysget(Ghandle h, void *p)
* gvirtget():
* get the unified virtual address associated with buffer pointer @p
*/
-uint64_t gvirtget(Ghandle h, void *p)
+uint64_t gvirtget(Ghandle h, const void *p)
{
gdev_vas_t *vas = h->vas;
gdev_mem_t *mem;
- uint32_t type = GDEV_MEM_DMA;
uint64_t offset;
- mem = gdev_mem_lookup(vas, (uint64_t)p, type);
- if (mem)
- offset = (uint64_t)p - (uint64_t)gdev_mem_getbuf(mem);
- else {
- type |= GDEV_MEM_DEVICE;
- mem = gdev_mem_lookup(vas, (uint64_t)p, type);
- if (mem)
- offset = (uint64_t)p - (uint64_t)gdev_mem_getbuf(mem);
- else
+ mem = gdev_mem_lookup_by_buf(vas, p, GDEV_MEM_DEVICE);
+ if (!mem) {
+ mem = gdev_mem_lookup_by_buf(vas, p, GDEV_MEM_DMA);
+ if (!mem)
goto fail;
}
+ offset = (uint64_t)p - (uint64_t)gdev_mem_getbuf(mem);
+
return gdev_mem_getaddr(mem) + offset;
fail:
View
7 common/gdev_api.h
@@ -58,7 +58,8 @@ int gmemcpy_from_device(Ghandle h, void *dst_buf, uint64_t src_addr, uint64_t si
int gmemcpy_from_device_async(Ghandle h, void *dst_buf, uint64_t src_addr, uint64_t size, uint32_t *id);
int gmemcpy_user_from_device(Ghandle h, void *dst_buf, uint64_t src_addr, uint64_t size);
int gmemcpy_user_from_device_async(Ghandle h, void *dst_buf, uint64_t src_addr, uint64_t size, uint32_t *id);
-int gmemcpy_in_device(Ghandle h, uint64_t dst_addr, uint64_t src_addr, uint64_t size);
+int gmemcpy(Ghandle h, uint64_t dst_addr, uint64_t src_addr, uint64_t size);
+int gmemcpy_async(Ghandle h, uint64_t dst_addr, uint64_t src_addr, uint64_t size, uint32_t *id);
int glaunch(Ghandle h, struct gdev_kernel *kernel, uint32_t *id);
int gsync(Ghandle h, uint32_t id, struct gdev_time *timeout);
int gbarrier(Ghandle h);
@@ -70,8 +71,8 @@ int gshmdt(Ghandle h, uint64_t addr);
int gshmctl(Ghandle h, int id, int cmd, void *buf);
uint64_t gref(Ghandle hmaster, uint64_t addr, uint64_t size, Ghandle hslave);
int gunref(Ghandle h, uint64_t addr);
-uint64_t gphysget(Ghandle h, void *p);
-uint64_t gvirtget(Ghandle h, void *p);
+uint64_t gphysget(Ghandle h, const void *p);
+uint64_t gvirtget(Ghandle h, const void *p);
/**
View
3 common/gdev_arch.h
@@ -80,7 +80,8 @@ void gdev_mem_free(gdev_mem_t *mem);
void gdev_mem_gc(gdev_vas_t *vas);
void *gdev_mem_map(gdev_mem_t *mem, uint64_t offset, uint64_t size);
void gdev_mem_unmap(gdev_mem_t *mem);
-gdev_mem_t *gdev_mem_lookup(gdev_vas_t *vas, uint64_t addr, int type);
+gdev_mem_t *gdev_mem_lookup_by_addr(gdev_vas_t *vas, uint64_t addr, int type);
+gdev_mem_t *gdev_mem_lookup_by_buf(gdev_vas_t *vas, const void *buf, int type);
void *gdev_mem_getbuf(gdev_mem_t *mem);
uint64_t gdev_mem_getaddr(gdev_mem_t *mem);
uint64_t gdev_mem_getsize(gdev_mem_t *mem);
View
29 common/gdev_ioctl_def.h
@@ -47,20 +47,21 @@
#define GDEV_IOCTL_GMEMCPY_TO_DEVICE_ASYNC 0x107
#define GDEV_IOCTL_GMEMCPY_FROM_DEVICE 0x108
#define GDEV_IOCTL_GMEMCPY_FROM_DEVICE_ASYNC 0x109
-#define GDEV_IOCTL_GMEMCPY_IN_DEVICE 0x110
-#define GDEV_IOCTL_GLAUNCH 0x111
-#define GDEV_IOCTL_GSYNC 0x112
-#define GDEV_IOCTL_GBARRIER 0x113
-#define GDEV_IOCTL_GQUERY 0x114
-#define GDEV_IOCTL_GTUNE 0x115
-#define GDEV_IOCTL_GSHMGET 0x116
-#define GDEV_IOCTL_GSHMAT 0x117
-#define GDEV_IOCTL_GSHMDT 0x118
-#define GDEV_IOCTL_GSHMCTL 0x119
-#define GDEV_IOCTL_GREF 0x120
-#define GDEV_IOCTL_GUNREF 0x121
-#define GDEV_IOCTL_GPHYSGET 0x122
-#define GDEV_IOCTL_GVIRTGET 0x123
+#define GDEV_IOCTL_GMEMCPY 0x110
+#define GDEV_IOCTL_GMEMCPY_ASYNC 0x111
+#define GDEV_IOCTL_GLAUNCH 0x112
+#define GDEV_IOCTL_GSYNC 0x113
+#define GDEV_IOCTL_GBARRIER 0x114
+#define GDEV_IOCTL_GQUERY 0x115
+#define GDEV_IOCTL_GTUNE 0x116
+#define GDEV_IOCTL_GSHMGET 0x117
+#define GDEV_IOCTL_GSHMAT 0x118
+#define GDEV_IOCTL_GSHMDT 0x119
+#define GDEV_IOCTL_GSHMCTL 0x120
+#define GDEV_IOCTL_GREF 0x121
+#define GDEV_IOCTL_GUNREF 0x122
+#define GDEV_IOCTL_GPHYSGET 0x123
+#define GDEV_IOCTL_GVIRTGET 0x124
struct gdev_ioctl_handle {
uint64_t handle;
View
35 common/gdev_nvidia_mem.c
@@ -246,26 +246,49 @@ void gdev_mem_unmap(struct gdev_mem *mem)
}
}
-/* look up the memory object allocated at the specified address. */
-struct gdev_mem *gdev_mem_lookup(struct gdev_vas *vas, uint64_t addr, int type)
+/* look up a memory object associated with device virtual memory address. */
+struct gdev_mem *gdev_mem_lookup_by_addr(struct gdev_vas *vas, uint64_t addr, int type)
{
struct gdev_mem *mem = NULL;
unsigned long flags;
switch (type) {
- case (GDEV_MEM_DEVICE | GDEV_MEM_DMA):
+ case GDEV_MEM_DEVICE:
gdev_lock_save(&vas->lock, &flags);
gdev_list_for_each (mem, &vas->mem_list, list_entry_heap) {
- uint64_t map_addr = (uint64_t)mem->map;
- if ((addr >= map_addr) && (addr < map_addr + mem->size))
+ if ((addr >= mem->addr) && (addr < mem->addr + mem->size))
+ break;
+ }
+ gdev_unlock_restore(&vas->lock, &flags);
+ break;
+ case GDEV_MEM_DMA:
+ gdev_lock_save(&vas->lock, &flags);
+ gdev_list_for_each (mem, &vas->dma_mem_list, list_entry_heap) {
+ if ((addr >= mem->addr) && (addr < mem->addr + mem->size))
break;
}
gdev_unlock_restore(&vas->lock, &flags);
break;
+ default:
+ GDEV_PRINT("Memory type not supported\n");
+ }
+
+ return mem;
+}
+
+/* look up a memory object associated with host buffer address. */
+struct gdev_mem *gdev_mem_lookup_by_buf(struct gdev_vas *vas, const void *buf, int type)
+{
+ struct gdev_mem *mem = NULL;
+ uint64_t addr = (uint64_t)buf;
+ unsigned long flags;
+
+ switch (type) {
case GDEV_MEM_DEVICE:
gdev_lock_save(&vas->lock, &flags);
gdev_list_for_each (mem, &vas->mem_list, list_entry_heap) {
- if ((addr >= mem->addr) && (addr < mem->addr + mem->size))
+ uint64_t map_addr = (uint64_t)mem->map;
+ if ((addr >= map_addr) && (addr < map_addr + mem->size))
break;
}
gdev_unlock_restore(&vas->lock, &flags);
View
14 common/gdev_nvidia_shm.c
@@ -261,13 +261,21 @@ struct gdev_mem *gdev_shm_attach(struct gdev_vas *vas, struct gdev_mem *mem, uin
if (!mem) {
/* select a victim memory object. victim->shm will be newly
- allocated if NULL, with shm->users being incremented. */
+ allocated if NULL. */
if (!(mem = __gdev_shm_find_victim(vas, size)))
goto fail_victim;
implicit = 1;
}
- else
+ else {
+ if (!mem->shm) {
+ struct gdev_shm *shm;
+ if (!(shm = MALLOC(sizeof(*shm))))
+ goto fail_heap;
+ /* initialize shared memory, but don't add it to the list. */
+ __gdev_shm_init(mem, shm);
+ }
implicit = 0;
+ }
/* borrow the same (physical) memory space by sharing. */
if (!(new = gdev_raw_mem_share(vas, mem)))
@@ -303,6 +311,7 @@ struct gdev_mem *gdev_shm_attach(struct gdev_vas *vas, struct gdev_mem *mem, uin
gdev_raw_mem_unshare(new);
fail_shm:
mem->shm->users--;
+fail_heap:
fail_victim:
return NULL;
@@ -327,6 +336,7 @@ void gdev_shm_detach(struct gdev_mem *mem)
gdev_list_del(&mem->list_entry_shm);
if (shm->implicit)
__gdev_swap_detach(mem);
+
/* if the memory object is shared but no users, free it.
since users == 0, no one else will use mem->shm. */
if (shm->users == 0) {
View
58 cuda/driver_api/memory.c
@@ -245,11 +245,11 @@ CUresult cuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, unsigned int B
*/
CUresult cuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost, unsigned int ByteCount, CUstream hStream)
{
- Ghandle handle, handle_ref;
+ Ghandle handle, handle_r;
struct CUstream_st *stream = hStream;
const void *src_buf = srcHost;
uint64_t dst_addr = dstDevice;
- uint64_t dst_addr_ref;
+ uint64_t dst_addr_r, src_addr_r, src_addr;
uint32_t size = ByteCount;
struct gdev_cuda_fence *fence;
uint32_t id;
@@ -269,23 +269,36 @@ CUresult cuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost, unsigned
return CUDA_ERROR_OUT_OF_MEMORY; /* this API shouldn't return it... */
handle = gdev_ctx_current->gdev_handle;
- handle_ref = stream->gdev_handle;
+ handle_r = stream->gdev_handle;
- if (!(dst_addr_ref = gref(handle, dst_addr, size, handle_ref)))
+ /* reference the device memory address. */
+ if (!(dst_addr_r = gref(handle, dst_addr, size, handle_r)))
goto fail_gref;
- if (gmemcpy_to_device_async(handle_ref, dst_addr_ref, src_buf, size, &id))
+ /* translate from buffer to address. */
+ if (!(src_addr = gvirtget(handle, src_buf)))
+ goto fail_gvirtget;
+
+ /* reference the host memory address. */
+ if (!(src_addr_r = gref(handle, src_addr, size, handle_r)))
+ goto fail_gref_dma;
+
+ /* now we can just copy data in the global address space. */
+ if (gmemcpy_async(handle_r, dst_addr_r, src_addr_r, size, &id))
goto fail_gmemcpy;
fence->id = id;
- fence->addr_ref = dst_addr_ref;
+ fence->addr_ref = dst_addr_r;
gdev_list_init(&fence->list_entry, fence);
gdev_list_add(&fence->list_entry, &stream->sync_list);
return CUDA_SUCCESS;
fail_gmemcpy:
- gunref(handle_ref, dst_addr_ref);
+ gunref(handle_r, src_addr_r);
+fail_gvirtget:
+fail_gref_dma:
+ gunref(handle_r, dst_addr_r);
fail_gref:
FREE(fence);
@@ -350,11 +363,11 @@ CUresult cuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, unsigned int ByteCou
*/
CUresult cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream)
{
- Ghandle handle, handle_ref;
+ Ghandle handle, handle_r;
struct CUstream_st *stream = hStream;
void *dst_buf = dstHost;
uint64_t src_addr = srcDevice;
- uint64_t src_addr_ref;
+ uint64_t src_addr_r, dst_addr_r, dst_addr;
uint32_t size = ByteCount;
struct gdev_cuda_fence *fence;
uint32_t id;
@@ -374,23 +387,36 @@ CUresult cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice, unsigned int By
return CUDA_ERROR_OUT_OF_MEMORY; /* this API shouldn't return it... */
handle = gdev_ctx_current->gdev_handle;
- handle_ref = stream->gdev_handle;
+ handle_r = stream->gdev_handle;
- if (!(src_addr_ref = gref(handle, src_addr, size, handle_ref)))
+ /* reference the device memory address. */
+ if (!(src_addr_r = gref(handle, src_addr, size, handle_r)))
goto fail_gref;
-
- if (gmemcpy_from_device_async(handle_ref, dst_buf, src_addr_ref, size, &id))
+
+ /* translate from buffer to address. */
+ if (!(dst_addr = gvirtget(handle, dst_buf)))
+ goto fail_gvirtget;
+
+ /* reference the host memory address. */
+ if (!(dst_addr_r = gref(handle, dst_addr, size, handle_r)))
+ goto fail_gref_dma;
+
+ /* now we can just copy data in the global address space. */
+ if (gmemcpy_async(handle_r, dst_addr_r, src_addr_r, size, &id))
goto fail_gmemcpy;
fence->id = id;
- fence->addr_ref = src_addr_ref;
+ fence->addr_ref = src_addr_r;
gdev_list_init(&fence->list_entry, fence);
gdev_list_add(&fence->list_entry, &stream->sync_list);
return CUDA_SUCCESS;
fail_gmemcpy:
- gunref(handle_ref, src_addr_ref);
+ gunref(handle_r, dst_addr_r);
+fail_gref_dma:
+fail_gvirtget:
+ gunref(handle_r, src_addr_r);
fail_gref:
FREE(fence);
@@ -428,7 +454,7 @@ CUresult cuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, unsigned int
handle = gdev_ctx_current->gdev_handle;
- if (gmemcpy_in_device(handle, dst_addr, src_addr, size))
+ if (gmemcpy(handle, dst_addr, src_addr, size))
return CUDA_ERROR_UNKNOWN;
return CUDA_SUCCESS;
View
3 cuda/driver_api/stream.c
@@ -168,9 +168,6 @@ CUresult cuStreamSynchronize(CUstream hStream)
FREE(f);
}
- if (gbarrier(handle))
- return CUDA_ERROR_UNKNOWN;
-
return CUDA_SUCCESS;
}
View
3 driver/gdev/gdev_drv.c
@@ -539,7 +539,8 @@ EXPORT_SYMBOL(gmemcpy_from_device);
EXPORT_SYMBOL(gmemcpy_from_device_async);
EXPORT_SYMBOL(gmemcpy_user_from_device);
EXPORT_SYMBOL(gmemcpy_user_from_device_async);
-EXPORT_SYMBOL(gmemcpy_in_device);
+EXPORT_SYMBOL(gmemcpy);
+EXPORT_SYMBOL(gmemcpy_async);
EXPORT_SYMBOL(glaunch);
EXPORT_SYMBOL(gsync);
EXPORT_SYMBOL(gbarrier);
View
6 driver/gdev/gdev_drv_nvidia.c
@@ -307,7 +307,7 @@ struct gdev_mem *gdev_raw_mem_share(struct gdev_vas *vas, struct gdev_mem *mem)
struct gdev_drv_bo bo;
struct gdev_mem *new;
struct gdev_device *gdev = vas->gdev;
- struct drm_device *drm = (struct drm_device *) gdev->priv;
+ struct drm_device *drm = (struct drm_device *)gdev->priv;
if (!(new = kzalloc(sizeof(*new), GFP_KERNEL)))
goto fail_mem;
@@ -348,8 +348,8 @@ void gdev_raw_mem_unshare(struct gdev_mem *mem)
vspace.priv = vas->pvas;
bo.priv = mem->bo;
bo.addr = mem->addr;
- bo.size = mem->size;
- bo.map = mem->map;
+ bo.size = mem->size; /* not really used. */
+ bo.map = mem->map; /* not really used. */
gdev_drv_bo_unbind(&vspace, &bo);
kfree(mem);
View
6 driver/gdev/gdev_fops.c
@@ -107,8 +107,10 @@ static int gdev_ioctl
return gdev_ioctl_gmemcpy_from_device(handle, arg);
case GDEV_IOCTL_GMEMCPY_FROM_DEVICE_ASYNC:
return gdev_ioctl_gmemcpy_from_device_async(handle, arg);
- case GDEV_IOCTL_GMEMCPY_IN_DEVICE:
- return gdev_ioctl_gmemcpy_in_device(handle, arg);
+ case GDEV_IOCTL_GMEMCPY:
+ return gdev_ioctl_gmemcpy(handle, arg);
+ case GDEV_IOCTL_GMEMCPY_ASYNC:
+ return gdev_ioctl_gmemcpy_async(handle, arg);
case GDEV_IOCTL_GLAUNCH:
return gdev_ioctl_glaunch(handle, arg);
case GDEV_IOCTL_GSYNC:
View
23 driver/gdev/gdev_ioctl.c
@@ -306,14 +306,33 @@ int gdev_ioctl_gmemcpy_from_device_async(Ghandle handle, unsigned long arg)
return 0;
}
-int gdev_ioctl_gmemcpy_in_device(Ghandle handle, unsigned long arg)
+int gdev_ioctl_gmemcpy(Ghandle handle, unsigned long arg)
{
struct gdev_ioctl_dma dma;
if (copy_from_user(&dma, (void __user *)arg, sizeof(dma)))
return -EFAULT;
- return gmemcpy_in_device(handle, dma.dst_addr, dma.src_addr, dma.size);
+ return gmemcpy(handle, dma.dst_addr, dma.src_addr, dma.size);
+}
+
+int gdev_ioctl_gmemcpy_async(Ghandle handle, unsigned long arg)
+{
+ struct gdev_ioctl_dma dma;
+ int id;
+ int ret;
+
+ if (copy_from_user(&dma, (void __user *)arg, sizeof(dma)))
+ return -EFAULT;
+
+ ret = gmemcpy_async(handle, dma.dst_addr, dma.src_addr, dma.size, &id);
+ if (ret)
+ return ret;
+
+ if (copy_to_user((void __user *)dma.id, &id, sizeof(id)))
+ return -EFAULT;
+
+ return 0;
}
int gdev_ioctl_glaunch(Ghandle handle, unsigned long arg)
View
3 driver/gdev/gdev_ioctl.h
@@ -44,7 +44,8 @@ int gdev_ioctl_gmemcpy_to_device(Ghandle h, unsigned long arg);
int gdev_ioctl_gmemcpy_to_device_async(Ghandle h, unsigned long arg);
int gdev_ioctl_gmemcpy_from_device(Ghandle h, unsigned long arg);
int gdev_ioctl_gmemcpy_from_device_async(Ghandle h, unsigned long arg);
-int gdev_ioctl_gmemcpy_in_device(Ghandle h, unsigned long arg);
+int gdev_ioctl_gmemcpy(Ghandle h, unsigned long arg);
+int gdev_ioctl_gmemcpy_async(Ghandle h, unsigned long arg);
int gdev_ioctl_glaunch(Ghandle h, unsigned long arg);
int gdev_ioctl_gsync(Ghandle h, unsigned long arg);
int gdev_ioctl_gbarrier(Ghandle h, unsigned long arg);
View
2 driver/nouveau/drivers/gpu/drm/nouveau/gdev_interface.c
@@ -264,6 +264,7 @@ EXPORT_SYMBOL(gdev_drv_bo_bind);
int gdev_drv_bo_unbind(struct gdev_drv_vspace *drv_vspace, struct gdev_drv_bo *drv_bo)
{
+#if 0 /* this will crash the system... */
struct nouveau_channel *chan = (struct nouveau_channel *)drv_vspace->priv;
struct nouveau_bo *bo = (struct nouveau_bo *)drv_bo->priv;
struct nouveau_vma *vma;
@@ -275,6 +276,7 @@ int gdev_drv_bo_unbind(struct gdev_drv_vspace *drv_vspace, struct gdev_drv_bo *d
}
else
return -ENOENT;
+#endif
return 0;
}
View
130 lib/kernel/gdev_lib.c
@@ -49,6 +49,25 @@ struct gdev_handle {
struct gdev_list map_bo_list;
};
+/* return "OS-space" buffer address, if @addr is associated with DMA buffer. */
+static uint64_t __gdev_lookup_dma_buf(struct gdev_handle *h, uint64_t addr)
+{
+ struct gdev_map_bo *bo;
+ uint64_t buf_addr;
+
+ /* look up if @addr is associated with DMA buffer. */
+ gdev_list_for_each (bo, &h->map_bo_list, list_entry) {
+ buf_addr = (uint64_t)bo->buf;
+ if ((addr >= buf_addr) && (addr < buf_addr + bo->size))
+ break;
+ }
+
+ if (bo)
+ return bo->addr + addr - buf_addr;
+ else
+ return 0; /* means NULL. */
+}
+
struct gdev_handle *gopen(int minor)
{
char devname[32];
@@ -114,12 +133,12 @@ void *gmalloc_dma(struct gdev_handle *h, uint64_t size)
if (buf == MAP_FAILED)
goto fail_map;
- bo = (struct gdev_map_bo*) malloc(sizeof(*bo));
+ bo = (struct gdev_map_bo *)malloc(sizeof(*bo));
if (!bo)
goto fail_malloc;
gdev_list_init(&bo->list_entry, bo);
gdev_list_add(&bo->list_entry, &h->map_bo_list);
- bo->addr = mem.addr; /* buffer pointer address valid in OS-space */
+ bo->addr = mem.addr; /* "OS-space" buffer address */
bo->size = size; /* could be different from mem.size */
bo->buf = buf;
@@ -174,12 +193,12 @@ void *gmap(struct gdev_handle *h, uint64_t addr, uint64_t size)
if (buf == MAP_FAILED)
goto fail_map;
- bo = (struct gdev_map_bo*) malloc(sizeof(*bo));
+ bo = (struct gdev_map_bo *)malloc(sizeof(*bo));
if (!bo)
goto fail_malloc;
gdev_list_init(&bo->list_entry, bo);
gdev_list_add(&bo->list_entry, &h->map_bo_list);
- bo->addr = map.buf; /* buffer pointer address valid in OS-space */
+ bo->addr = map.buf; /* "OS-space" buffer address */
bo->size = size;
bo->buf = buf;
@@ -211,33 +230,28 @@ int gunmap(struct gdev_handle *h, void *buf)
munmap(buf, bo->size);
map.addr = 0; /* unused */
map.size = 0; /* unused */
- map.buf = bo->addr; /* bo->addr holds kernel-space buffer pointer */
+ map.buf = bo->addr; /* "OS-space" buffer address */
free(bo);
return ioctl(fd, GDEV_IOCTL_GUNMAP, &map);
}
static int __gmemcpy_to_device(struct gdev_handle *h, uint64_t dst_addr, const void *src_buf, uint64_t size, uint32_t *id, int ioctl_cmd)
{
- struct gdev_map_bo *bo;
struct gdev_ioctl_dma dma;
uint64_t src_addr = (uint64_t)src_buf;
- uint64_t buf_addr;
+ uint64_t dma_addr;
int fd = h->fd;
- /* look up if @src_buf is allocated on host DMA buffer already. */
- gdev_list_for_each (bo, &h->map_bo_list, list_entry) {
- buf_addr = (uint64_t)bo->buf;
- if ((src_addr >= buf_addr) && (src_addr < buf_addr + bo->size))
- break;
- }
+ /* look up if @src_buf is allocated on DMA buffer already. */
+ dma_addr = __gdev_lookup_dma_buf(h, src_addr);
dma.dst_addr = dst_addr;
- if (bo)
- /* this is "PCI-space" host address */
- dma.src_buf = (void *)(bo->addr + (src_addr - buf_addr));
+ if (dma_addr)
+ /* this is "OS-space" buffer address associated with DMA buffer. */
+ dma.src_buf = (void *)dma_addr;
else
- /* this is "user-space" buffer */
+ /* this is "user-space" buffer address. */
dma.src_buf = src_buf;
dma.size = size;
dma.id = id;
@@ -257,25 +271,20 @@ int gmemcpy_to_device_async(struct gdev_handle *h, uint64_t dst_addr, const void
static int __gmemcpy_from_device(struct gdev_handle *h, void *dst_buf, uint64_t src_addr, uint64_t size, uint32_t *id, int ioctl_cmd)
{
- struct gdev_map_bo *bo;
struct gdev_ioctl_dma dma;
uint64_t dst_addr = (uint64_t)dst_buf;
- uint64_t buf_addr;
+ uint64_t dma_addr;
int fd = h->fd;
- /* look up if @dst_buf is allocated on host DMA buffer already. */
- gdev_list_for_each (bo, &h->map_bo_list, list_entry) {
- buf_addr = (uint64_t)bo->buf;
- if ((dst_addr >= buf_addr) && (dst_addr < buf_addr + bo->size))
- break;
- }
+ /* look up if @dst_buf is allocated on DMA buffer already. */
+ dma_addr = __gdev_lookup_dma_buf(h, dst_addr);
dma.src_addr = src_addr;
- if (bo)
- /* this is "PCI-space" host address */
- dma.dst_buf = (void *)(bo->addr + (dst_addr - buf_addr));
+ if (dma_addr)
+ /* this is "OS-space" buffer address associated with DMA buffer. */
+ dma.dst_buf = (void *)dma_addr;
else
- /* this is "user-space" buffer */
+ /* this is "user-space" buffer address. */
dma.dst_buf = dst_buf;
dma.size = size;
dma.id = id;
@@ -293,16 +302,30 @@ int gmemcpy_from_device_async(struct gdev_handle *h, void *dst_buf, uint64_t src
return __gmemcpy_from_device(h, dst_buf, src_addr, size, id, GDEV_IOCTL_GMEMCPY_FROM_DEVICE_ASYNC);
}
-int gmemcpy_in_device(struct gdev_handle *h, uint64_t dst_addr, uint64_t src_addr, uint64_t size)
+int gmemcpy(struct gdev_handle *h, uint64_t dst_addr, uint64_t src_addr, uint64_t size)
{
struct gdev_ioctl_dma dma;
int fd = h->fd;
dma.dst_addr = dst_addr;
dma.src_addr = src_addr;
dma.size = size;
+ dma.id = NULL;
- return ioctl(fd, GDEV_IOCTL_GMEMCPY_IN_DEVICE, &dma);
+ return ioctl(fd, GDEV_IOCTL_GMEMCPY, &dma);
+}
+
+int gmemcpy_async(struct gdev_handle *h, uint64_t dst_addr, uint64_t src_addr, uint64_t size, uint32_t *id)
+{
+ struct gdev_ioctl_dma dma;
+ int fd = h->fd;
+
+ dma.dst_addr = dst_addr;
+ dma.src_addr = src_addr;
+ dma.size = size;
+ dma.id = id;
+
+ return ioctl(fd, GDEV_IOCTL_GMEMCPY_ASYNC, &dma);
}
int glaunch(struct gdev_handle *h, struct gdev_kernel *kernel, uint32_t *id)
@@ -423,14 +446,21 @@ uint64_t gref(struct gdev_handle *hmaster, uint64_t addr, uint64_t size, struct
{
struct gdev_ioctl_ref r;
struct gdev_ioctl_handle h;
+ uint64_t dma_addr;
int fd_master = hmaster->fd;
int fd_slave = hslave->fd;
int ret;
if ((ret = ioctl(fd_slave, GDEV_IOCTL_GET_HANDLE, &h)))
return ret;
- r.addr = addr;
+ /* look up whether @addr is associated with a DMA buffer. */
+ dma_addr = __gdev_lookup_dma_buf(hmaster, addr);
+
+ if (dma_addr)
+ r.addr = dma_addr; /* "OS-space" buffer address */
+ else
+ r.addr = addr;
r.size = size;
r.handle_slave = h.handle;
if ((ret = ioctl(fd_master, GDEV_IOCTL_GREF, &r)))
@@ -452,47 +482,41 @@ int gunref(struct gdev_handle *h, uint64_t addr)
return 0;
}
-uint64_t gphysget(struct gdev_handle *h, void *p)
+uint64_t gphysget(struct gdev_handle *h, const void *p)
{
- struct gdev_map_bo *bo;
struct gdev_ioctl_phys phys;
int fd = h->fd;
uint64_t p_addr = (uint64_t)p;
- uint64_t buf_addr;
+ uint64_t dma_addr;
+
+ dma_addr = __gdev_lookup_dma_buf(h, p_addr);
+ if (dma_addr)
+ goto physget;
- gdev_list_for_each (bo, &h->map_bo_list, list_entry) {
- buf_addr = (uint64_t)bo->buf;
- if ((p_addr >= buf_addr) && (p_addr < buf_addr + bo->size))
- goto physget;
- }
return 0;
physget:
- /* bo->addr is buffer pointer address valid in OS-space. */
- phys.addr = bo->addr + (p_addr - buf_addr);
+ phys.addr = dma_addr; /* "OS-space" buffer address */
ioctl(fd, GDEV_IOCTL_GPHYSGET, &phys);
return phys.phys;
}
-uint64_t gvirtget(struct gdev_handle *h, void *p)
+uint64_t gvirtget(struct gdev_handle *h, const void *p)
{
- struct gdev_map_bo *bo;
struct gdev_ioctl_virt virt;
int fd = h->fd;
uint64_t p_addr = (uint64_t)p;
- uint64_t buf_addr;
+ uint64_t dma_addr;
+
+ dma_addr = __gdev_lookup_dma_buf(h, p_addr);
+ if (dma_addr)
+ goto virtget;
- gdev_list_for_each (bo, &h->map_bo_list, list_entry) {
- buf_addr = (uint64_t)bo->buf;
- if ((p_addr >= buf_addr) && (p_addr < buf_addr + bo->size))
- goto physget;
- }
return 0;
-physget:
- /* bo->addr is buffer pointer address valid in OS-space. */
- virt.addr = bo->addr + (p_addr - buf_addr);
+virtget:
+ virt.addr = dma_addr; /* "OS-space" buffer address */
ioctl(fd, GDEV_IOCTL_GVIRTGET, &virt);
return virt.virt;
View
87 test/cuda/common/memcpy_async.c
@@ -31,6 +31,7 @@ int cuda_test_memcpy_async(unsigned int size)
CUresult res;
CUdevice dev;
CUcontext ctx;
+ CUstream stream;
CUdeviceptr data_addr;
unsigned int *in, *out;
struct timeval tv;
@@ -41,13 +42,6 @@ int cuda_test_memcpy_async(unsigned int size)
struct timeval tv_d2h_start, tv_d2h_end;
float d2h;
- in = (unsigned int *) malloc(size);
- out = (unsigned int *) malloc(size);
- for (i = 0; i < size / 4; i++) {
- in[i] = i+1;
- out[i] = 0;
- }
-
gettimeofday(&tv_total_start, NULL);
res = cuInit(0);
@@ -68,27 +62,77 @@ int cuda_test_memcpy_async(unsigned int size)
return -1;
}
+ res = cuStreamCreate(&stream, 0);
+ if (res != CUDA_SUCCESS) {
+ printf("cuStreamCreate failed: res = %u\n", (unsigned int)res);
+ return -1;
+ }
+
res = cuMemAlloc(&data_addr, size);
if (res != CUDA_SUCCESS) {
printf("cuMemAlloc failed: res = %u\n", (unsigned int)res);
return -1;
}
- gettimeofday(&tv_h2d_start, NULL);
- res = cuMemcpyHtoDAsync(data_addr, in, size, 0);
- gettimeofday(&tv_h2d_end, NULL);
+ res = cuMemAllocHost((void **)&in, size);
+ if (res != CUDA_SUCCESS) {
+ printf("cuMemAllocHost(in) failed: res = %u\n", (unsigned int)res);
+ return -1;
+ }
+ res = cuMemAllocHost((void **)&out, size);
+ if (res != CUDA_SUCCESS) {
+ printf("cuMemAllocHost(out) failed: res = %u\n", (unsigned int)res);
+ return -1;
+ }
+
+ for (i = 0; i < size / 4; i++) {
+ in[i] = i+1;
+ out[i] = 0;
+ }
+
+ gettimeofday(&tv_h2d_start, NULL);
+ res = cuMemcpyHtoDAsync(data_addr, in, size, stream);
if (res != CUDA_SUCCESS) {
printf("cuMemcpyHtoDAsync failed: res = %u\n", (unsigned int)res);
return -1;
}
+ res = cuStreamSynchronize(stream);
+ if (res != CUDA_SUCCESS) {
+ printf("cuStreamSynchronize() failed: res = %u\n", (unsigned int)res);
+ return -1;
+ }
+ gettimeofday(&tv_h2d_end, NULL);
gettimeofday(&tv_d2h_start, NULL);
- res = cuMemcpyDtoHAsync(out, data_addr, size, 0);
+ res = cuMemcpyDtoHAsync(out, data_addr, size, stream);
+ if (res != CUDA_SUCCESS) {
+ printf("cuMemcpyDtoHAsync failed: res = %u\n", (unsigned int)res);
+ return -1;
+ }
+ res = cuStreamSynchronize(stream);
+ if (res != CUDA_SUCCESS) {
+ printf("cuStreamSynchronize() failed: res = %u\n", (unsigned int)res);
+ return -1;
+ }
gettimeofday(&tv_d2h_end, NULL);
+ for (i = 0; i < size / 4; i++) {
+ if (in[i] != out[i]) {
+ printf("in[%d] = %u, out[%d] = %u\n",
+ i, in[i], i, out[i]);
+ }
+ }
+
+ res = cuMemFreeHost(out);
if (res != CUDA_SUCCESS) {
- printf("cuMemcpyDtoHAsync failed: res = %u\n", (unsigned int)res);
+ printf("cuMemFreeHost(out) failed: res = %u\n", (unsigned int)res);
+ return -1;
+ }
+
+ res = cuMemFreeHost(in);
+ if (res != CUDA_SUCCESS) {
+ printf("cuMemFreeHost(in) failed: res = %u\n", (unsigned int)res);
return -1;
}
@@ -98,6 +142,12 @@ int cuda_test_memcpy_async(unsigned int size)
return -1;
}
+ res = cuStreamDestroy(stream);
+ if (res != CUDA_SUCCESS) {
+ printf("cuStreamDestroy failed: res = %u\n", (unsigned int)res);
+ return -1;
+ }
+
res = cuCtxDestroy(ctx);
if (res != CUDA_SUCCESS) {
printf("cuCtxDestroy failed: res = %u\n", (unsigned int)res);
@@ -106,17 +156,6 @@ int cuda_test_memcpy_async(unsigned int size)
gettimeofday(&tv_total_end, NULL);
- for (i = 0; i < size / 4; i++) {
- if (in[i] != out[i]) {
- printf("in[%d] = %u, out[%d] = %u\n",
- i, in[i], i, out[i]);
- goto end;
- }
- }
-
- free(in);
- free(out);
-
tvsub(&tv_h2d_end, &tv_h2d_start, &tv);
h2d = tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
tvsub(&tv_d2h_end, &tv_d2h_start, &tv);
@@ -130,8 +169,6 @@ int cuda_test_memcpy_async(unsigned int size)
return 0;
end:
- free(in);
- free(out);
return -1;
}

0 comments on commit 9fd7f4b

Please sign in to comment.