vpp-delogo: speed up the auto fade by reducing transfers to the CPU.
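In the auto-fade path, kernel_create_adjust_mask2 produces a valid-pixel count for each of the DELOGO_ADJMASK_DIV_COUNT threshold divisions. Before this commit, as the retained #else branch below shows, those counts were copied to the CPU (copyDtoH) and the host picked the division to use. The new kernel_create_adjust_mask3 performs both the selection and the copy of the chosen mask on the GPU, removing the synchronous device-to-host transfer. The selection step alone, reduced to a minimal sketch with hypothetical names (select_first_reaching, scores, num_candidates are illustrative only, not part of the source):

// Sketch only: pick the first candidate whose score reaches a target on the
// device itself, instead of copying all scores to the CPU to decide there.
// kernel_create_adjust_mask3 fuses the copy of the chosen mask into the same
// launch, so no separate selection kernel is actually needed in the source.
__global__ void select_first_reaching(int *selected, const int *scores,
                                      int num_candidates, int target) {
    if (threadIdx.x == 0 && blockIdx.x == 0) { // candidate list is tiny
        int sel = num_candidates - 1;          // fallback if none reaches target
        for (int i = 0; i < num_candidates; i++) {
            if (scores[i] >= target) { sel = i; break; }
        }
        *selected = sel;
    }
}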
rigaya committed Aug 13, 2018
1 parent 31f7fd0 commit 884b7a4
Showing 1 changed file with 68 additions and 2 deletions.
70 changes: 68 additions & 2 deletions NVEncCore/NVEncFilterDelogo.cu
@@ -1137,6 +1137,60 @@ __global__ void kernel_create_adjust_mask2(
}
}

//work unit: 4 elements/thread
//block layout: DELOGO_BLOCK_X * DELOGO_BLOCK_Y
template<typename TypeMask4>
__global__ void kernel_create_adjust_mask3(
    uint8_t *__restrict__ ptr_dst_adjusted_mask, //TypeMask4
    const int *__restrict__ ptr_temp_valid_mask_count, const int block_count,
    const uint8_t *__restrict__ ptr_src_adjusted_mask, //TypeMask4
    const int mask_pitch, const int mask_size, const int logo_w, const int logo_h,
    const int target_count
) {
    const int imgx = blockIdx.x * DELOGO_BLOCK_X /*blockDim.x*/ + threadIdx.x;
    const int imgy = (blockIdx.y * DELOGO_BLOCK_Y + threadIdx.y);
    const int lid = threadIdx.y * DELOGO_BLOCK_X + threadIdx.x;

    //accumulate the per-block valid counts of each mask division
    __shared__ int mask_count[DELOGO_ADJMASK_DIV_COUNT];
    if (lid < DELOGO_ADJMASK_DIV_COUNT) {
        mask_count[lid] = 0;
    }
    __syncthreads(); //mask_count must be fully zeroed before any warp accumulates into it
    for (int j = threadIdx.y; j < DELOGO_ADJMASK_DIV_COUNT; j += DELOGO_BLOCK_Y) {
        int tmp = 0;
        for (int i = threadIdx.x; i < block_count; i += DELOGO_BLOCK_X) {
            tmp += ptr_temp_valid_mask_count[j * block_count + i];
        }
        tmp = warp_sum<int, WARP_SIZE>(tmp); //per-row reduction (assumes DELOGO_BLOCK_X == WARP_SIZE)
        const int lane = lid & (WARP_SIZE - 1);
        if (lane == 0) {
            atomicAdd(&mask_count[j], tmp);
        }
    }
    __syncthreads();

    //pick the first division whose valid count reaches target_count
    __shared__ int shared_tmp;
    if (lid == 0) {
        shared_tmp = DELOGO_ADJMASK_DIV_COUNT - 1; //fallback so shared_tmp is never read uninitialized
        for (int i = 0; i < DELOGO_ADJMASK_DIV_COUNT; i++) {
            if (mask_count[i] >= target_count) {
                shared_tmp = i;
                break;
            }
        }
#if DELOGO_DEBUG_CUDA
        for (int i = 0; i < DELOGO_ADJMASK_DIV_COUNT; i++) {
            printf("mask_count[%d]=%d\n", i, mask_count[i]);
        }
#endif
    }
    __syncthreads();
    //copy the selected division's mask to the output, 4 elements per thread
    if (imgx < logo_w && imgy < logo_h) {
        ptr_src_adjusted_mask += shared_tmp * mask_size;
        ptr_src_adjusted_mask += imgy * mask_pitch + imgx * sizeof(TypeMask4);
        ptr_dst_adjusted_mask += imgy * mask_pitch + imgx * sizeof(TypeMask4);
        *(TypeMask4 *)ptr_dst_adjusted_mask = *(TypeMask4 *)ptr_src_adjusted_mask;
    }
}
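The reduction above leans on warp_sum, a helper defined elsewhere in the NVEnc sources and not shown in this diff. For orientation, a shuffle-based warp-wide sum commonly looks like the sketch below; this is an assumption about its shape, not the project's actual implementation (__shfl_xor_sync needs CUDA 9 or later):

// Sketch of a butterfly warp reduction: after log2(WARP_SZ) xor steps every
// lane holds the sum over the whole warp, so lane 0 can atomicAdd it once.
template<typename T, int WARP_SZ>
__device__ T warp_sum(T val) {
    #pragma unroll
    for (int offset = WARP_SZ >> 1; offset > 0; offset >>= 1) {
        val += __shfl_xor_sync(0xffffffffu, val, offset);
    }
    return val;
}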

NVENCSTATUS NVEncFilterDelogo::createLogoMask(int maskThreshold) {
    const auto pLogoData = &m_sProcessData[LOGO__Y];
    if (pLogoData->width % 4 != 0) {
@@ -1344,7 +1398,7 @@ NVENCSTATUS NVEncFilterDelogo::createAdjustedMask(const FrameInfo *frame_logo) {
    const int target_count = (int)((float)valid_mask_count * (1.0f - rate));
    const dim3 gridSize2(gridSize.x, gridSize.y, DELOGO_ADJMASK_DIV_COUNT);

-   kernel_create_adjust_mask2<char4, short4><<<gridSize2, blockSize>>>(
+   kernel_create_adjust_mask2<char4, short4><<<gridSize2, blockSize, 0, stream>>>(
        (uint8_t *)m_adjMaskThresholdTest->frame.ptr, //TypeMask4
        (int *)m_adjMask2ValidMaskCount.ptrDevice,
        (const uint8_t *)m_bufEval[0]->frame.ptr, //TypeMask4
@@ -1364,7 +1418,18 @@
    cudaerr = cudaThreadSynchronize();
    debug_out_csv<char>(m_adjMaskThresholdTest.get(), _T("m_adjMaskThresholdTest.csv"));
#endif

#if 1
    //select and copy the adjusted mask entirely on the GPU,
    //avoiding the device-to-host transfer of the old path below
    kernel_create_adjust_mask3<short4><<<gridSize, blockSize, 0, stream>>>(
        (uint8_t *)m_maskAdjusted->frame.ptr, //TypeMask4
        (const int *)m_adjMask2ValidMaskCount.ptrDevice, blockCount,
        (const uint8_t *)m_adjMaskThresholdTest->frame.ptr, //TypeMask4
        m_maskAdjusted->frame.pitch, m_maskAdjusted->frame.pitch * logo_h,
        logo_w, logo_h, target_count);
#if DELOGO_DEBUG_CUDA
    cudaerr = cudaThreadSynchronize();
#endif
    cudaEventRecord(*m_adjMaskStream.heEvalCopyFin.get(), stream);
#else
    m_adjMask2ValidMaskCount.copyDtoH();
    if (cudaerr != cudaSuccess) {
        AddMessage(RGY_LOG_ERROR, _T("failed to copy data from GPU(m_adjMask2ValidMaskCount): %s.\n"),
@@ -1399,6 +1464,7 @@ NVENCSTATUS NVEncFilterDelogo::createAdjustedMask(const FrameInfo *frame_logo) {
        return NV_ENC_ERR_INVALID_CALL;
    }
    cudaEventRecord(*m_adjMaskStream.heEvalCopyFin.get(), *m_adjMaskStream.stEvalSub.get());
#endif
    return NV_ENC_SUCCESS;
}
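Note on ordering: the new path records heEvalCopyFin on the same stream the kernel ran on, so downstream work can wait on the event from another stream instead of stalling the host. A minimal sketch of that pattern (streamEval and streamConsumer are hypothetical names, not identifiers from the source):

cudaEvent_t evalCopyFin;
cudaEventCreateWithFlags(&evalCopyFin, cudaEventDisableTiming);

// producer: build the adjusted mask, then mark completion on the same stream
kernel_create_adjust_mask3<short4><<<gridSize, blockSize, 0, streamEval>>>(/* args */);
cudaEventRecord(evalCopyFin, streamEval);

// consumer: this blocks the stream (not the CPU) until the event has fired
cudaStreamWaitEvent(streamConsumer, evalCopyFin, 0);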

