diff --git a/NVEncCore/NVEncFilterDelogo.cu b/NVEncCore/NVEncFilterDelogo.cu index 28377db7..a4c13fed 100644 --- a/NVEncCore/NVEncFilterDelogo.cu +++ b/NVEncCore/NVEncFilterDelogo.cu @@ -1137,6 +1137,60 @@ __global__ void kernel_create_adjust_mask2( } } +//処理単位: 4要素/thread +//ブロック構成: DELOGO_BLOCK_X * DELOGO_BLOCK_Y +template +__global__ void kernel_create_adjust_mask3( + uint8_t *__restrict__ ptr_dst_adjusted_mask, //TypeMask4 + const int *__restrict__ ptr_temp_valid_mask_count, const int block_count, + const uint8_t *__restrict__ ptr_src_adjusted_mask, //TypeMask4 + const int mask_pitch, const int mask_size, const int logo_w, const int logo_h, + const int target_count +) { + const int imgx = blockIdx.x * DELOGO_BLOCK_X /*blockDim.x*/ + threadIdx.x; + const int imgy = (blockIdx.y * DELOGO_BLOCK_Y + threadIdx.y); + const int lid = threadIdx.y * DELOGO_BLOCK_X + threadIdx.x; + + __shared__ int mask_count[DELOGO_ADJMASK_DIV_COUNT]; + if (lid < DELOGO_ADJMASK_DIV_COUNT) { + mask_count[lid] = 0; + } + for (int j = threadIdx.y; j < DELOGO_ADJMASK_DIV_COUNT; j += DELOGO_BLOCK_Y) { + int tmp = 0; + for (int i = threadIdx.x; i < block_count; i += DELOGO_BLOCK_X) { + tmp += ptr_temp_valid_mask_count[j * block_count + i]; + } + tmp = warp_sum(tmp); + const int lane = lid & (WARP_SIZE - 1); + if (lane == 0) { + atomicAdd(&mask_count[j], tmp); + } + } + __syncthreads(); + + __shared__ int shared_tmp; + if (lid == 0) { + for (int i = 0; i < DELOGO_ADJMASK_DIV_COUNT; i++) { + if (mask_count[i] >= target_count) { + shared_tmp = i; + break; + } + } +#if DELOGO_DEBUG_CUDA + for (int i = 0; i < DELOGO_ADJMASK_DIV_COUNT; i++) { + printf("mask_count[%d]=%d\n", i, mask_count[i]); + } +#endif + } + __syncthreads(); + if (imgx < logo_w && imgy < logo_h) { + ptr_src_adjusted_mask += shared_tmp * mask_size; + ptr_src_adjusted_mask += imgy * mask_pitch + imgx * sizeof(TypeMask4); + ptr_dst_adjusted_mask += imgy * mask_pitch + imgx * sizeof(TypeMask4); + *(TypeMask4 *)ptr_dst_adjusted_mask = *(TypeMask4 *)ptr_src_adjusted_mask; + } +} + NVENCSTATUS NVEncFilterDelogo::createLogoMask(int maskThreshold) { const auto pLogoData = &m_sProcessData[LOGO__Y]; if (pLogoData->width % 4 != 0) { @@ -1344,7 +1398,7 @@ NVENCSTATUS NVEncFilterDelogo::createAdjustedMask(const FrameInfo *frame_logo) { const int target_count = (int)((float)valid_mask_count * (1.0f - rate)); const dim3 gridSize2(gridSize.x, gridSize.y, DELOGO_ADJMASK_DIV_COUNT); - kernel_create_adjust_mask2<<>>( + kernel_create_adjust_mask2<<>>( (uint8_t *)m_adjMaskThresholdTest->frame.ptr, //TypeMask4 (int *)m_adjMask2ValidMaskCount.ptrDevice, (const uint8_t *)m_bufEval[0]->frame.ptr, //TypeMask4 @@ -1364,7 +1418,18 @@ NVENCSTATUS NVEncFilterDelogo::createAdjustedMask(const FrameInfo *frame_logo) { cudaerr = cudaThreadSynchronize(); debug_out_csv(m_adjMaskThresholdTest.get(), _T("m_adjMaskThresholdTest.csv")); #endif - +#if 1 + kernel_create_adjust_mask3<<>>( + (uint8_t *)m_maskAdjusted->frame.ptr, //TypeMask4 + (const int *)m_adjMask2ValidMaskCount.ptrDevice, blockCount, + (const uint8_t *)m_adjMaskThresholdTest->frame.ptr, //TypeMask4 + m_maskAdjusted->frame.pitch, m_maskAdjusted->frame.pitch * logo_h, + logo_w, logo_h, target_count); +#if DELOGO_DEBUG_CUDA + cudaerr = cudaThreadSynchronize(); +#endif + cudaEventRecord(*m_adjMaskStream.heEvalCopyFin.get(), stream); +#else m_adjMask2ValidMaskCount.copyDtoH(); if (cudaerr != cudaSuccess) { AddMessage(RGY_LOG_ERROR, _T("failed to copy data from GPU(m_adjMask2ValidMaskCount): %s.\n"), @@ -1399,6 +1464,7 @@ NVENCSTATUS NVEncFilterDelogo::createAdjustedMask(const FrameInfo *frame_logo) { return NV_ENC_ERR_INVALID_CALL; } cudaEventRecord(*m_adjMaskStream.heEvalCopyFin.get(), *m_adjMaskStream.stEvalSub.get()); +#endif return NV_ENC_SUCCESS; }