gdev: fixed asynchronous memcpy functions

pscnv: added chan_alloc()s and chan_kill()s for pcopy0 and 1 in gdev_interface
nouveau: added context_new()s and context_del()s for pcopy0 and 1 in gdev_interface
commit 6a30a05185d6086f1c00e588037ea8bcc46444ad (parent: 3820d76)
Authored by Shinpei Kato
common/gdev_api.c (28 lines changed)
@@ -192,12 +192,16 @@ static int __gmemcpy_dma_to_device(gdev_ctx_t *ctx, uint64_t dst_addr, uint64_t
{
uint32_t fence;
- /* we don't break data into chunks if copying directly from dma memory. */
- fence = gdev_memcpy(ctx, dst_addr, src_addr, size);
- if (!id)
+ /* we don't break data into chunks if copying directly from dma memory.
+ if @id == NULL, it means memcpy is synchronous. */
+ if (!id) {
+ fence = gdev_memcpy(ctx, dst_addr, src_addr, size);
gdev_poll(ctx, fence, NULL);
- else
+ }
+ else {
+ fence = gdev_memcpy_async(ctx, dst_addr, src_addr, size);
*id = fence;
+ }
return 0;
}
@@ -369,9 +373,9 @@ static int __gmemcpy_from_device_np(gdev_ctx_t *ctx, void *dst_buf, uint64_t src
offset = 0;
while (rest_size) {
dma_size = __min(rest_size, ch_size);
- fence = gdev_memcpy(ctx, dma_addr[0], src_addr+offset, dma_size);
+ fence = gdev_memcpy(ctx, dma_addr[0], src_addr + offset, dma_size);
gdev_poll(ctx, fence, NULL);
- ret = host_copy(dst_buf+offset, dma_buf[0], dma_size);
+ ret = host_copy(dst_buf + offset, dma_buf[0], dma_size);
if (ret)
goto end;
rest_size -= dma_size;
@@ -389,12 +393,16 @@ static int __gmemcpy_dma_from_device(gdev_ctx_t *ctx, uint64_t dst_addr, uint64_
{
uint32_t fence;
- /* we don't break data into chunks if copying directly from dma memory. */
- fence = gdev_memcpy(ctx, dst_addr, src_addr, size);
- if (!id)
+ /* we don't break data into chunks if copying directly from dma memory.
+ if @id == NULL, it means memcpy is synchronous. */
+ if (!id) {
+ fence = gdev_memcpy(ctx, dst_addr, src_addr, size);
gdev_poll(ctx, fence, NULL);
- else
+ }
+ else {
+ fence = gdev_memcpy_async(ctx, dst_addr, src_addr, size);
*id = fence;
+ }
return 0;
}
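Note on the control flow above: a NULL @id selects the blocking path (gdev_memcpy() followed immediately by gdev_poll()), while a non-NULL @id takes the asynchronous path and hands the fence back to the caller. A minimal caller sketch, assuming ctx, dst, src and size are already set up; do_other_work() is a hypothetical placeholder for overlapped work:

static int copy_with_overlap(gdev_ctx_t *ctx, uint64_t dst, uint64_t src, uint32_t size)
{
	uint32_t fence;

	fence = gdev_memcpy_async(ctx, dst, src, size); /* queues the copy, returns at once */
	do_other_work(); /* hypothetical: overlap CPU work with the DMA transfer */
	gdev_poll(ctx, fence, NULL); /* block until the fence value lands */
	return 0;
}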
common/gdev_arch.h (1 line changed)
@@ -47,6 +47,7 @@ typedef struct gdev_mem gdev_mem_t;
int gdev_compute_setup(struct gdev_device *gdev);
uint32_t gdev_launch(gdev_ctx_t *ctx, struct gdev_kernel *kern);
uint32_t gdev_memcpy(gdev_ctx_t *ctx, uint64_t dst_addr, uint64_t src_addr, uint32_t size);
+uint32_t gdev_memcpy_async(gdev_ctx_t *ctx, uint64_t dst_addr, uint64_t src_addr, uint32_t size);
uint32_t gdev_read32(gdev_mem_t *mem, uint64_t addr);
void gdev_write32(gdev_mem_t *mem, uint64_t addr, uint32_t val);
int gdev_read(gdev_mem_t *mem, void *buf, uint64_t addr, uint32_t size);
common/gdev_device.h (7 lines changed)
@@ -39,10 +39,11 @@
#define GDEV_PHYSICAL_DEVICE_MAX_COUNT 8
/**
- * generic subchannel definitions
+ * generic operation definitions
*/
-#define GDEV_SUBCH_COMPUTE 1
-#define GDEV_SUBCH_MEMCPY 2
+#define GDEV_OP_COMPUTE 1
+#define GDEV_OP_MEMCPY 2
+#define GDEV_OP_MEMCPY_ASYNC 3
/**
* Gdev device struct:
common/gdev_nvidia.c (2 lines changed)
@@ -107,7 +107,7 @@ struct gdev_ctx *gdev_ctx_new(struct gdev_device *gdev, struct gdev_vas *vas)
ctx->vas = vas;
- /* initialize the channel. */
+ /* initialize the compute-related objects. this must follow ctx_new(). */
compute->init(ctx);
return ctx;
common/gdev_nvidia.h (18 lines changed)
@@ -36,19 +36,12 @@
#include "gdev_time.h"
/**
- * MRQ requires compute and memcpy to be overlapped - hence PCOPY.
+ * subchannel IDs == generic operation IDs.
*/
-#ifdef GDEV_SCHED_MRQ
-#define GDEV_NVIDIA_MEMCPY_PCOPY
-#endif
-
-#define GDEV_SUBCH_NV_COMPUTE GDEV_SUBCH_COMPUTE
-#ifndef GDEV_NVIDIA_MEMCPY_PCOPY
-#define GDEV_SUBCH_NV_M2MF GDEV_SUBCH_MEMCPY
-#else
-#define GDEV_SUBCH_NV_PCOPY0 GDEV_SUBCH_MEMCPY
-#endif
-#define GDEV_SUBCH_NV_PCOPY1 (GDEV_SUBCH_MEMCPY + 1)
+#define GDEV_SUBCH_NV_COMPUTE GDEV_OP_COMPUTE /* 1 */
+#define GDEV_SUBCH_NV_M2MF GDEV_OP_MEMCPY /* 2 */
+#define GDEV_SUBCH_NV_PCOPY0 GDEV_OP_MEMCPY_ASYNC /* 3 */
+#define GDEV_SUBCH_NV_PCOPY1 (GDEV_SUBCH_NV_PCOPY0 + 1) /* 4 */
#define GDEV_FENCE_BUF_SIZE 0x10000 /* 64KB */
#define GDEV_FENCE_QUERY_SIZE 0x10 /* aligned with nvc0's query */
@@ -199,6 +192,7 @@ struct gdev_compute {
void (*fence_write)(struct gdev_ctx *, int, uint32_t);
void (*fence_reset)(struct gdev_ctx *, uint32_t);
void (*memcpy)(struct gdev_ctx *, uint64_t, uint64_t, uint32_t);
+ void (*memcpy_async)(struct gdev_ctx *, uint64_t, uint64_t, uint32_t);
void (*membar)(struct gdev_ctx *);
void (*notify_intr)(struct gdev_ctx *);
void (*init)(struct gdev_ctx *);
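The renaming makes the invariant explicit: the NVIDIA subchannel IDs are now numerically equal to the generic operation IDs, with PCOPY1 taking the next free slot. An illustrative compile-time check (not part of the commit; C11 _Static_assert shown, the in-kernel equivalent would be BUILD_BUG_ON()):

/* illustrative only: the code relies on these equalities holding. */
_Static_assert(GDEV_SUBCH_NV_COMPUTE == GDEV_OP_COMPUTE, "compute subch == op");
_Static_assert(GDEV_SUBCH_NV_M2MF == GDEV_OP_MEMCPY, "m2mf subch == op");
_Static_assert(GDEV_SUBCH_NV_PCOPY0 == GDEV_OP_MEMCPY_ASYNC, "pcopy0 subch == op");
_Static_assert(GDEV_SUBCH_NV_PCOPY1 == 4, "pcopy1 takes the next slot");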
common/gdev_nvidia_compute.c (32 lines changed)
@@ -78,7 +78,7 @@ uint32_t gdev_launch(struct gdev_ctx *ctx, struct gdev_kernel *kern)
explicitly after the kernel is launched. */
compute->fence_reset(ctx, seq);
compute->launch(ctx, kern);
- compute->fence_write(ctx, GDEV_SUBCH_COMPUTE, seq);
+ compute->fence_write(ctx, GDEV_OP_COMPUTE, seq);
#ifndef GDEV_SCHED_DISABLED
/* set an interrupt to be caused when compute done. */
@@ -88,7 +88,7 @@ uint32_t gdev_launch(struct gdev_ctx *ctx, struct gdev_kernel *kern)
return seq;
}
-/* asynchrounously copy data of @size from @src_addr to @dst_addr. */
+/* copy data of @size from @src_addr to @dst_addr. */
uint32_t gdev_memcpy(struct gdev_ctx *ctx, uint64_t dst_addr, uint64_t src_addr, uint32_t size)
{
struct gdev_vas *vas = ctx->vas;
@@ -106,12 +106,36 @@ uint32_t gdev_memcpy(struct gdev_ctx *ctx, uint64_t dst_addr, uint64_t src_addr,
the QUERY method, i.e., if QUERY is set, the sequence will be
written to the specified address when the data are transferred. */
compute->fence_reset(ctx, seq);
- compute->fence_write(ctx, GDEV_SUBCH_MEMCPY, seq);
+ compute->fence_write(ctx, GDEV_OP_MEMCPY /* == M2MF */, seq);
compute->memcpy(ctx, dst_addr, src_addr, size);
return seq;
}
+/* asynchronously copy data of @size from @src_addr to @dst_addr. */
+uint32_t gdev_memcpy_async(struct gdev_ctx *ctx, uint64_t dst_addr, uint64_t src_addr, uint32_t size)
+{
+ struct gdev_vas *vas = ctx->vas;
+ struct gdev_device *gdev = vas->gdev;
+ struct gdev_compute *compute = gdev->compute;
+ uint32_t seq;
+
+ if (++ctx->fence.seq == GDEV_FENCE_COUNT)
+ ctx->fence.seq = 1;
+ seq = ctx->fence.seq;
+
+ compute->membar(ctx);
+ /* it's important to emit a fence *before* memcpy():
+ the EXEC method of the PCOPY and M2MF engines is associated with
+ the QUERY method, i.e., if QUERY is set, the sequence will be
+ written to the specified address when the data are transferred. */
+ compute->fence_reset(ctx, seq);
+ compute->fence_write(ctx, GDEV_OP_MEMCPY_ASYNC /* == PCOPY0 */, seq);
+ compute->memcpy_async(ctx, dst_addr, src_addr, size);
+
+ return seq;
+}
+
/* read 32-bit value from @addr. */
uint32_t gdev_read32(struct gdev_mem *mem, uint64_t addr)
{
@@ -173,7 +197,7 @@ int gdev_barrier(struct gdev_ctx *ctx)
uint32_t seq = 0; /* 0 is a special sequence for barrier. */
compute->membar(ctx);
- compute->fence_write(ctx, GDEV_SUBCH_COMPUTE, seq);
+ compute->fence_write(ctx, GDEV_OP_COMPUTE, seq);
while (seq != compute->fence_read(ctx, seq));
return 0;
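gdev_memcpy() and gdev_memcpy_async() share the same sequence discipline: the per-context counter wraps within [1, GDEV_FENCE_COUNT), because sequence 0 is reserved for gdev_barrier() above. The shared logic, factored out as a sketch:

/* sketch: the fence sequence allocation both memcpy paths duplicate. */
static uint32_t next_fence_seq(struct gdev_ctx *ctx)
{
	if (++ctx->fence.seq == GDEV_FENCE_COUNT)
		ctx->fence.seq = 1; /* wrap around, skipping the barrier sequence 0 */
	return ctx->fence.seq;
}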
common/gdev_nvidia_nvc0.c (25 lines changed)
@@ -252,20 +252,25 @@ static void nvc0_fence_write(struct gdev_ctx *ctx, int subch, uint32_t sequence)
__gdev_out_ring(ctx, sequence); /* QUERY_SEQUENCE */
__gdev_out_ring(ctx, intr << 20); /* QUERY_GET */
break;
-#ifndef GDEV_NVIDIA_MEMCPY_PCOPY
case GDEV_SUBCH_NV_M2MF:
__gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_M2MF, 0x32c, 3);
__gdev_out_ring(ctx, vm_addr >> 32); /* QUERY_ADDRESS HIGH */
__gdev_out_ring(ctx, vm_addr); /* QUERY_ADDRESS LOW */
__gdev_out_ring(ctx, sequence); /* QUERY_SEQUENCE */
break;
-#else
case GDEV_SUBCH_NV_PCOPY0:
__gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_PCOPY0, 0x338, 3);
__gdev_out_ring(ctx, vm_addr >> 32); /* QUERY_ADDRESS HIGH */
__gdev_out_ring(ctx, vm_addr); /* QUERY_ADDRESS LOW */
__gdev_out_ring(ctx, sequence); /* QUERY_COUNTER */
break;
+#ifdef GDEV_NVIDIA_USE_PCOPY1
+ case GDEV_SUBCH_NV_PCOPY1:
+ __gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_PCOPY1, 0x338, 3);
+ __gdev_out_ring(ctx, vm_addr >> 32); /* QUERY_ADDRESS HIGH */
+ __gdev_out_ring(ctx, vm_addr); /* QUERY_ADDRESS LOW */
+ __gdev_out_ring(ctx, sequence); /* QUERY_COUNTER */
+ break;
#endif
}
@@ -277,7 +282,6 @@ static void nvc0_fence_reset(struct gdev_ctx *ctx, uint32_t sequence)
((struct gdev_nvc0_query*)(ctx->fence.map))[sequence].sequence = ~0;
}
-#ifndef GDEV_NVIDIA_MEMCPY_PCOPY
static void nvc0_memcpy_m2mf(struct gdev_ctx *ctx, uint64_t dst_addr, uint64_t src_addr, uint32_t size)
{
uint32_t mode1 = 0x102110; /* QUERY_SHORT|QUERY_YES|SRC_LINEAR|DST_LINEAR */
@@ -325,7 +329,7 @@ static void nvc0_memcpy_m2mf(struct gdev_ctx *ctx, uint64_t dst_addr, uint64_t s
__gdev_fire_ring(ctx);
}
-#else
+
static void nvc0_memcpy_pcopy0(struct gdev_ctx *ctx, uint64_t dst_addr, uint64_t src_addr, uint32_t size)
{
uint32_t mode = 0x3110; /* QUERY_SHORT|QUERY|SRC_LINEAR|DST_LINEAR */
@@ -370,7 +374,6 @@ static void nvc0_memcpy_pcopy0(struct gdev_ctx *ctx, uint64_t dst_addr, uint64_t
__gdev_fire_ring(ctx);
}
}
-#endif
static void nvc0_membar(struct gdev_ctx *ctx)
{
@@ -415,14 +418,13 @@ static void nvc0_init(struct gdev_ctx *ctx)
/* setup subchannels. */
__gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_COMPUTE, 0, 1);
__gdev_out_ring(ctx, 0x90c0); /* COMPUTE */
-#ifdef GDEV_NVIDIA_MEMCPY_PCOPY
+ __gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_M2MF, 0, 1);
+ __gdev_out_ring(ctx, 0x9039); /* M2MF */
__gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_PCOPY0, 0, 1);
__gdev_out_ring(ctx, 0x490b5); /* PCOPY0 */
+#ifdef GDEV_NVIDIA_USE_PCOPY1
__gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_PCOPY1, 0, 1);
__gdev_out_ring(ctx, 0x590b8 /* 0x590b5 */); /* PCOPY1 */
-#else
- __gdev_begin_ring_nvc0(ctx, GDEV_SUBCH_NV_M2MF, 0, 1);
- __gdev_out_ring(ctx, 0x9039); /* M2MF */
#endif
__gdev_fire_ring(ctx);
}
@@ -487,11 +489,8 @@ static struct gdev_compute gdev_compute_nvc0 = {
.fence_read = nvc0_fence_read,
.fence_write = nvc0_fence_write,
.fence_reset = nvc0_fence_reset,
-#ifndef GDEV_NVIDIA_MEMCPY_PCOPY
.memcpy = nvc0_memcpy_m2mf,
-#else
- .memcpy = nvc0_memcpy_pcopy0,
-#endif
+ .memcpy_async = nvc0_memcpy_pcopy0,
.membar = nvc0_membar,
.notify_intr = nvc0_notify_intr,
.init = nvc0_init,
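With the GDEV_NVIDIA_MEMCPY_PCOPY conditionals gone, nvc0_init() always binds M2MF and PCOPY0, and the synchronous/asynchronous choice is made per call through the ops table rather than at build time. A sketch of that dispatch, using only fields shown above (the route helper itself is hypothetical):

static void nvc0_memcpy_route(struct gdev_ctx *ctx, uint64_t dst, uint64_t src, uint32_t size, int async)
{
	struct gdev_compute *compute = ctx->vas->gdev->compute;

	if (async)
		compute->memcpy_async(ctx, dst, src, size); /* PCOPY0 engine */
	else
		compute->memcpy(ctx, dst, src, size); /* M2MF engine */
}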
driver/gdev/gdev_drv.c (15 lines changed)
@@ -56,11 +56,9 @@ static dev_t dev;
static struct cdev *cdevs; /* character devices for virtual devices */
/**
- * pointers to callback functions.
+ * interrupt notify handler
*/
-void (*gdev_callback_notify)(int subc, uint32_t data);
-
-static void __gdev_notify_handler(int subc, uint32_t data)
+static void __gdev_notify_handler(int op, uint32_t data)
{
struct gdev_device *gdev;
struct gdev_sched_entity *se;
@@ -69,15 +67,16 @@ static void __gdev_notify_handler(int subc, uint32_t data)
if (cid < GDEV_CONTEXT_MAX_COUNT) {
se = sched_entity_ptr[cid];
gdev = se->gdev;
- switch (subc) {
- case GDEV_SUBCH_COMPUTE:
+ switch (op) {
+ case GDEV_OP_COMPUTE:
+ case GDEV_OP_MEMCPY: /* memcpy is synchronous with compute */
wake_up_process(gdev->sched_com_thread);
break;
- case GDEV_SUBCH_MEMCPY:
+ case GDEV_OP_MEMCPY_ASYNC:
wake_up_process(gdev->sched_mem_thread);
break;
default:
- GDEV_PRINT("Unknown subchannel %d\n", subc);
+ GDEV_PRINT("Unknown operation %d\n", op);
}
}
else
driver/gdev/gdev_drv.h (5 lines changed)
@@ -45,9 +45,4 @@ struct gdev_mutex {
struct mutex mutex;
};
-/**
- * export callback function.
- */
-extern void (*gdev_callback_notify)(int subc, uint32_t data);
-
#endif
driver/nouveau/drivers/gpu/drm/nouveau/gdev_interface.c (19 lines changed)
@@ -71,10 +71,17 @@ int gdev_drv_chan_alloc(struct drm_device *drm, struct gdev_drv_vspace *drv_vspa
/* FIFO init: it has already been done in gdev_vas_new(). */
- /* FIFO command queue registers. */
switch (dev_priv->chipset & 0xf0) {
case 0xc0:
+ /* FIFO command queue registers. */
regs = chan->user;
+ /* PCOPY engines. */
+ ret = dev_priv->eng[NVOBJ_ENGINE_COPY0]->context_new(chan, NVOBJ_ENGINE_COPY0);
+ if (ret)
+ goto fail_pcopy0;
+ ret = dev_priv->eng[NVOBJ_ENGINE_COPY1]->context_new(chan, NVOBJ_ENGINE_COPY1);
+ if (ret)
+ goto fail_pcopy1;
break;
default:
ret = -EINVAL;
@@ -99,13 +106,21 @@ int gdev_drv_chan_alloc(struct drm_device *drm, struct gdev_drv_vspace *drv_vspa
return 0;
fail_fifo_reg:
+fail_pcopy1:
+ dev_priv->eng[NVOBJ_ENGINE_COPY0]->context_del(chan, NVOBJ_ENGINE_COPY0);
+fail_pcopy0:
return ret;
}
EXPORT_SYMBOL(gdev_drv_chan_alloc);
int gdev_drv_chan_free(struct gdev_drv_vspace *drv_vspace, struct gdev_drv_chan *drv_chan)
{
- /* really nothing to do. */
+ struct nouveau_channel *chan = (struct nouveau_channel *)drv_vspace->priv;
+ struct drm_nouveau_private *dev_priv = chan->dev->dev_private;
+
+ dev_priv->eng[NVOBJ_ENGINE_COPY1]->context_del(chan, NVOBJ_ENGINE_COPY1);
+ dev_priv->eng[NVOBJ_ENGINE_COPY0]->context_del(chan, NVOBJ_ENGINE_COPY0);
+
return 0;
}
EXPORT_SYMBOL(gdev_drv_chan_free);
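The fail_* labels follow the usual kernel unwind idiom: each label releases only what had already been acquired, in reverse order, so a failure at PCOPY1 tears down the PCOPY0 context but never touches the PCOPY1 one. The pattern in generic form (hypothetical names, a sketch rather than driver code):

int setup_two_engines(void)
{
	int ret;

	ret = engine0_context_new(); /* hypothetical step 1 */
	if (ret)
		goto fail_engine0;
	ret = engine1_context_new(); /* hypothetical step 2 */
	if (ret)
		goto fail_engine1;
	return 0;

fail_engine1:
	engine0_context_del(); /* undo step 1; step 2 never took effect */
fail_engine0:
	return ret;
}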
driver/pscnv/gdev_interface.c (16 lines changed)
@@ -98,10 +98,17 @@ int gdev_drv_chan_alloc(struct drm_device *drm, struct gdev_drv_vspace *drv_vspa
if (ret)
goto fail_fifo_init;
- /* FIFO command queue registers. */
switch (dev_priv->chipset & 0xf0) {
case 0xc0:
+ /* FIFO command queue registers. */
regs = nvc0_fifo_ctrl_ptr(drm, chan);
+ /* PCOPY engines. */
+ ret = dev_priv->engines[PSCNV_ENGINE_COPY0]->chan_alloc(dev_priv->engines[PSCNV_ENGINE_COPY0], chan);
+ if (ret)
+ goto fail_pcopy0;
+ ret = dev_priv->engines[PSCNV_ENGINE_COPY1]->chan_alloc(dev_priv->engines[PSCNV_ENGINE_COPY1], chan);
+ if (ret)
+ goto fail_pcopy1;
break;
default:
ret = -EINVAL;
@@ -126,6 +133,9 @@ int gdev_drv_chan_alloc(struct drm_device *drm, struct gdev_drv_vspace *drv_vspa
return 0;
fail_fifo_reg:
+fail_pcopy1:
+ dev_priv->engines[PSCNV_ENGINE_COPY0]->chan_kill(dev_priv->engines[PSCNV_ENGINE_COPY0], chan);
+fail_pcopy0:
fail_fifo_init:
vunmap(pb_map);
pscnv_vspace_unmap(vspace, pb_mm->start);
@@ -149,11 +159,15 @@ int gdev_drv_chan_free(struct gdev_drv_vspace *drv_vspace, struct gdev_drv_chan
struct pscnv_chan *chan = (struct pscnv_chan *)drv_chan->priv;
struct pscnv_bo *ib_bo = (struct pscnv_bo *)drv_chan->ib_bo;
struct pscnv_bo *pb_bo = (struct pscnv_bo *)drv_chan->pb_bo;
+ struct drm_nouveau_private *dev_priv = chan->dev->dev_private;
uint32_t *ib_map = drv_chan->ib_map;
uint32_t *pb_map = drv_chan->pb_map;
uint64_t ib_base = drv_chan->ib_base;
uint64_t pb_base = drv_chan->pb_base;
+ dev_priv->engines[PSCNV_ENGINE_COPY0]->chan_kill(dev_priv->engines[PSCNV_ENGINE_COPY0], chan);
+ dev_priv->engines[PSCNV_ENGINE_COPY1]->chan_kill(dev_priv->engines[PSCNV_ENGINE_COPY1], chan);
+
vunmap(pb_map);
pscnv_vspace_unmap(vspace, pb_base);
pscnv_mem_free(pb_bo);
test/cuda/common/memcpy_async.c (12 lines changed)
@@ -37,9 +37,9 @@ int cuda_test_memcpy_async(unsigned int size)
struct timeval tv_total_start, tv_total_end;
unsigned long total;
struct timeval tv_h2d_start, tv_h2d_end;
- unsigned long h2d;
+ float h2d;
struct timeval tv_d2h_start, tv_d2h_end;
- unsigned long d2h;
+ float d2h;
in = (unsigned int *) malloc(size);
out = (unsigned int *) malloc(size);
@@ -118,14 +118,14 @@ int cuda_test_memcpy_async(unsigned int size)
free(out);
tvsub(&tv_h2d_end, &tv_h2d_start, &tv);
- h2d = tv.tv_sec * 1000 + tv.tv_usec / 1000;
+ h2d = tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
tvsub(&tv_d2h_end, &tv_d2h_start, &tv);
- d2h = tv.tv_sec * 1000 + tv.tv_usec / 1000;
+ d2h = tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
tvsub(&tv_total_end, &tv_total_start, &tv);
total = tv.tv_sec * 1000 + tv.tv_usec / 1000;
- printf("HtoD: %lu\n", h2d);
- printf("DtoH: %lu\n", d2h);
+ printf("HtoD: %f\n", h2d);
+ printf("DtoH: %f\n", d2h);
return 0;
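The switch from unsigned long to float keeps sub-millisecond resolution: with the old integer math, tv_usec / 1000 truncates, so a copy finishing in under a millisecond reported 0 ms. tvsub() is the test helper that computes the timeval difference end - start; a sketch of the semantics the code above assumes:

#include <sys/time.h>

/* sketch: assumed tvsub() behavior, ret = x - y with usec borrow. */
static void tvsub(struct timeval *x, struct timeval *y, struct timeval *ret)
{
	ret->tv_sec = x->tv_sec - y->tv_sec;
	ret->tv_usec = x->tv_usec - y->tv_usec;
	if (ret->tv_usec < 0) {
		ret->tv_sec--;
		ret->tv_usec += 1000000;
	}
}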