Skip to content

Commit

Permalink
radeonsi: add SI_CONTEXT_PFP_SYNC_ME to skip syncing PFP for image operations
Browse files Browse the repository at this point in the history

DCC/CMASK/HTILE clears will not set this. We could do a better job
of not setting this in other cases too.

Image copies also don't set this.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9795>
  • Loading branch information
Marek Olšák authored and Marge Bot committed Apr 2, 2021
1 parent 4fb1b7b commit c532616
Show file tree
Hide file tree
Showing 8 changed files with 35 additions and 29 deletions.
5 changes: 4 additions & 1 deletion src/gallium/drivers/radeonsi/si_compute_blit.c
Expand Up @@ -70,6 +70,9 @@ void si_launch_grid_internal(struct si_context *sctx, struct pipe_grid_info *inf
if (flags & SI_OP_SYNC_CS_BEFORE)
sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;

if (!(flags & SI_OP_CS_IMAGE))
sctx->flags |= SI_CONTEXT_PFP_SYNC_ME;

/* Invalidate L0-L1 caches. */
/* sL0 is never invalidated, because src resources don't use it. */
if (!(flags & SI_OP_SKIP_CACHE_INV_BEFORE))
Expand Down Expand Up @@ -107,7 +110,7 @@ void si_launch_grid_internal(struct si_context *sctx, struct pipe_grid_info *inf
sctx->flags |= SI_CONTEXT_INV_VCACHE;
} else {
/* Make sure buffer stores are visible to all CUs. */
sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE;
sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE | SI_CONTEXT_PFP_SYNC_ME;
}
}
}
Expand Down
8 changes: 4 additions & 4 deletions src/gallium/drivers/radeonsi/si_cp_dma.c
Expand Up @@ -197,10 +197,10 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs,
assert(size && size % 4 == 0);

if (user_flags & SI_OP_SYNC_CS_BEFORE)
sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME;

if (user_flags & SI_OP_SYNC_PS_BEFORE)
sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME;

/* Mark the buffer range of destination as valid (initialized),
* so that transfer_map knows it should wait for the GPU when mapping
Expand Down Expand Up @@ -340,10 +340,10 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
}

if (user_flags & SI_OP_SYNC_CS_BEFORE)
sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME;

if (user_flags & SI_OP_SYNC_PS_BEFORE)
sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME;

if ((dst || src) && !(user_flags & SI_OP_SKIP_CACHE_INV_BEFORE))
sctx->flags |= si_get_flush_flags(sctx, coher, cache_policy);
Expand Down
32 changes: 15 additions & 17 deletions src/gallium/drivers/radeonsi/si_gfx_cs.c
Expand Up @@ -569,6 +569,8 @@ void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs, uns

assert(sctx->chip_class <= GFX9);

cp_coher_cntl |= 1u << 31; /* don't sync PFP, i.e. execute the sync in ME */

radeon_begin(cs);

if (sctx->chip_class == GFX9 || compute_ib) {
Expand Down Expand Up @@ -749,21 +751,22 @@ void gfx10_emit_cache_flush(struct si_context *ctx, struct radeon_cmdbuf *cs)

/* Ignore fields that only modify the behavior of other fields. */
if (gcr_cntl & C_586_GL1_RANGE & C_586_GL2_RANGE & C_586_SEQ) {
unsigned dont_sync_pfp = (!(flags & SI_CONTEXT_PFP_SYNC_ME)) << 31;

/* Flush caches and wait for the caches to assert idle.
* The cache flush is executed in the ME, but the PFP waits
* for completion.
*/
radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 6, 0));
radeon_emit(cs, 0); /* CP_COHER_CNTL */
radeon_emit(cs, dont_sync_pfp); /* CP_COHER_CNTL */
radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */
radeon_emit(cs, 0xffffff); /* CP_COHER_SIZE_HI */
radeon_emit(cs, 0); /* CP_COHER_BASE */
radeon_emit(cs, 0); /* CP_COHER_BASE_HI */
radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */
radeon_emit(cs, gcr_cntl); /* GCR_CNTL */
} else if (cb_db_event || (flags & (SI_CONTEXT_VS_PARTIAL_FLUSH | SI_CONTEXT_PS_PARTIAL_FLUSH |
SI_CONTEXT_CS_PARTIAL_FLUSH))) {
/* We need to ensure that PFP waits as well. */
} else if (flags & SI_CONTEXT_PFP_SYNC_ME) {
/* Synchronize PFP with ME. (this stalls PFP) */
radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
radeon_emit(cs, 0);
}
Expand Down Expand Up @@ -953,23 +956,11 @@ void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs)
si_cp_wait_mem(sctx, cs, va, sctx->wait_mem_number, 0xffffffff, WAIT_REG_MEM_EQUAL);
}

/* Make sure ME is idle (it executes most packets) before continuing.
* This prevents read-after-write hazards between PFP and ME.
*/
if (sctx->has_graphics &&
(cp_coher_cntl || (flags & (SI_CONTEXT_CS_PARTIAL_FLUSH | SI_CONTEXT_INV_VCACHE |
SI_CONTEXT_INV_L2 | SI_CONTEXT_WB_L2)))) {
radeon_begin(cs);
radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
radeon_emit(cs, 0);
radeon_end();
}

/* GFX6-GFX8 only:
* When one of the CP_COHER_CNTL.DEST_BASE flags is set, SURFACE_SYNC
* waits for idle, so it should be last. SURFACE_SYNC is done in PFP.
*
* cp_coher_cntl should contain all necessary flags except TC flags
* cp_coher_cntl should contain all necessary flags except TC and PFP flags
* at this point.
*
* GFX6-GFX7 don't support L2 write-back.
Expand Down Expand Up @@ -1011,6 +1002,13 @@ void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs)
if (cp_coher_cntl)
si_emit_surface_sync(sctx, cs, cp_coher_cntl);

if (flags & SI_CONTEXT_PFP_SYNC_ME) {
radeon_begin(cs);
radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
radeon_emit(cs, 0);
radeon_end();
}

if (is_barrier)
si_prim_discard_signal_next_compute_ib_start(sctx);

Expand Down
3 changes: 3 additions & 0 deletions src/gallium/drivers/radeonsi/si_pipe.h
Expand Up @@ -98,6 +98,9 @@ extern "C" {
#define SI_CONTEXT_CS_PARTIAL_FLUSH (1 << 14)
#define SI_CONTEXT_VGT_FLUSH (1 << 15)
#define SI_CONTEXT_VGT_STREAMOUT_SYNC (1 << 16)
/* PFP waits for ME to finish. Used to sync for index and indirect buffers and render
* condition. It's typically set when doing a VS/PS/CS partial flush for buffers. */
#define SI_CONTEXT_PFP_SYNC_ME (1 << 17)

#define SI_PREFETCH_LS (1 << 1)
#define SI_PREFETCH_HS (1 << 2)
Expand Down
2 changes: 1 addition & 1 deletion src/gallium/drivers/radeonsi/si_query.c
Expand Up @@ -1582,7 +1582,7 @@ static void si_query_hw_get_result_resource(struct si_context *sctx, struct si_q
}

sctx->b.launch_grid(&sctx->b, &grid);
sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME;
}

si_restore_qbo_state(sctx, &saved_state);
Expand Down
4 changes: 2 additions & 2 deletions src/gallium/drivers/radeonsi/si_sqtt.c
Expand Up @@ -383,7 +383,7 @@ si_thread_trace_start(struct si_context *sctx, int family, struct radeon_cmdbuf
sctx->flags |=
SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
SI_CONTEXT_INV_L2;
SI_CONTEXT_INV_L2 | SI_CONTEXT_PFP_SYNC_ME;
sctx->emit_cache_flush(sctx, cs);

si_inhibit_clockgating(sctx, cs, true);
Expand Down Expand Up @@ -426,7 +426,7 @@ si_thread_trace_stop(struct si_context *sctx, int family, struct radeon_cmdbuf *
sctx->flags |=
SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
SI_CONTEXT_INV_L2;
SI_CONTEXT_INV_L2 | SI_CONTEXT_PFP_SYNC_ME;
sctx->emit_cache_flush(sctx, cs);

si_emit_thread_trace_stop(sctx, cs, family);
Expand Down
3 changes: 2 additions & 1 deletion src/gallium/drivers/radeonsi/si_state.c
Expand Up @@ -5001,7 +5001,8 @@ static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)

/* Subsequent commands must wait for all shader invocations to
* complete. */
sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH;
sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
SI_CONTEXT_PFP_SYNC_ME;

if (flags & PIPE_BARRIER_CONSTANT_BUFFER)
sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE;
Expand Down
7 changes: 4 additions & 3 deletions src/gallium/drivers/radeonsi/si_state_streamout.c
Expand Up @@ -112,7 +112,7 @@ static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targ

/* The BUFFER_FILLED_SIZE is written using a PS_DONE event. */
if (sctx->screen->use_ngg_streamout) {
sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME;

/* Wait now. This is needed to make sure that GDS is not
* busy at the end of IBs.
Expand All @@ -122,7 +122,7 @@ static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targ
*/
wait_now = true;
} else {
sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH;
sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME;
}
}

Expand All @@ -133,7 +133,8 @@ static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targ
if (sctx->screen->use_ngg_streamout)
si_allocate_gds(sctx);

sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH;
sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
SI_CONTEXT_PFP_SYNC_ME;
}

/* Streamout buffers must be bound in 2 places:
Expand Down

0 comments on commit c532616

Please sign in to comment.