Skip to content

Commit

Permalink
p010→nv12変換を追加し、avswで10bitをhwデコードするとエラー終了する問題を修正。
Browse files Browse the repository at this point in the history
  • Loading branch information
rigaya committed May 12, 2024
1 parent b232794 commit 51e1594
Show file tree
Hide file tree
Showing 4 changed files with 107 additions and 0 deletions.
15 changes: 15 additions & 0 deletions QSVEnc/QSVEnc_readme.txt
Original file line number Diff line number Diff line change
Expand Up @@ -313,6 +313,21 @@ API v1.1 … Intel Media SDK v2.0


【どうでもいいメモ】
2024.05.12 (7.64)
- ffmpeg 7.0に更新。(Windows版)
- ffmpeg 6.1 -> 7.0
- libpng 1.4.0 -> 1.4.3
- expat 2.5.0 -> 2.6.2
- opus 1.4 -> 1.5.2
- libxml2 2.12.0 -> 2.12.6
- dav1d 1.3.0 -> 1.4.1
- libvpl 2.11.0 (new!)
- nv-codec-headers 12.2.72.0 (new!)
- avswで使用するデコーダを指定可能に。
- --audio-bitrateの指定がないとき、デフォルトのビットレートを設定するのではなく、コーデックに任せるように。
- --audio-bitrateあるいは--audio-copyで指定のない音声/字幕/データトラックは処理しないように。
- QSVEnc 7.62以降のside_dataの扱いが誤っており、--master-display copy/--max-cll copyが正常に行われていなかった問題を修正。

2024.04.28 (7.63)
- 新たなノイズ除去フィルタを追加。(--vpp-nlmeans)
- --audio-resamplerを拡張し、文字列でパラメータ設定できるように。
Expand Down
37 changes: 37 additions & 0 deletions QSVPipeline/convert_csp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@ void copy_p010_to_p010_sse2(void **dst, const void **src, int width, int src_y_p
void copy_nv12_to_nv12_avx2(void **dst, const void **src, int width, int src_y_pitch_byte, int src_uv_pitch_byte, int dst_y_pitch_byte, int height, int dst_height, int thread_id, int thread_n, int *crop);
void copy_p010_to_p010_avx2(void **dst, const void **src, int width, int src_y_pitch_byte, int src_uv_pitch_byte, int dst_y_pitch_byte, int height, int dst_height, int thread_id, int thread_n, int *crop);

// AVX2 converters between the NV12 and P010 layouts; the definitions live in convert_csp_avx2.cpp.
void copy_p010_to_nv12_avx2(void **dst, const void **src, int width, int src_y_pitch_byte, int src_uv_pitch_byte, int dst_y_pitch_byte, int height, int dst_height, int thread_id, int thread_n, int *crop);
void copy_nv12_to_p010_avx2(void **dst, const void **src, int width, int src_y_pitch_byte, int src_uv_pitch_byte, int dst_y_pitch_byte, int height, int dst_height, int thread_id, int thread_n, int *crop);

void convert_yuy2_to_nv12(void **dst, const void **src, int width, int src_y_pitch_byte, int src_uv_pitch_byte, int dst_y_pitch_byte, int height, int dst_height, int thread_id, int thread_n, int *crop);
void convert_yuy2_to_nv12_sse2(void **dst, const void **src, int width, int src_y_pitch_byte, int src_uv_pitch_byte, int dst_y_pitch_byte, int height, int dst_height, int thread_id, int thread_n, int *crop);
void convert_yuy2_to_nv12_avx(void **dst, const void **src, int width, int src_y_pitch_byte, int src_uv_pitch_byte, int dst_y_pitch_byte, int height, int dst_height, int thread_id, int thread_n, int *crop);
Expand Down Expand Up @@ -240,6 +243,36 @@ void copy_p010_to_p010_c(void **dst, const void **src, int width, int src_y_pitc
return copy_nv12_to_nv12_c_internal<true>(dst, src, width, src_y_pitch_byte, src_uv_pitch_byte, dst_y_pitch_byte, height, dst_height, thread_id, thread_n, crop);
}

// Plain C fallback that copies a two-plane frame (dst[0]/src[0] and dst[1]/src[1])
// while converting every sample from Tin (in_bit_depth bits) to Tout (out_bit_depth bits).
//
//   dst, src           : arrays of plane pointers; indices 0 and 1 are accessed.
//   width              : number of samples per row before cropping (the same count is used for both planes).
//   src_y_pitch_byte   : source row stride in bytes; it is applied to both planes.
//   src_uv_pitch_byte  : not used by this routine.
//   dst_y_pitch_byte   : destination row stride in bytes; it is applied to both planes.
//   height             : number of rows of plane 0 before cropping; plane 1 uses height >> 1.
//   dst_height         : not used by this routine.
//   thread_id/thread_n : forwarded to thread_y_range() to select the rows handled by this call.
//   crop               : { left, up, right, bottom } crop amounts.
template<typename Tin, int in_bit_depth, typename Tout, int out_bit_depth>
void copy_nv12p010_to_nv12p010_c_internal(void **dst, const void **src, int width, int src_y_pitch_byte, int src_uv_pitch_byte, int dst_y_pitch_byte, int height, int dst_height, int thread_id, int thread_n, int *crop) {
    const int crop_left   = crop[0];
    const int crop_up     = crop[1];
    const int crop_right  = crop[2];
    const int crop_bottom = crop[3];
    // Number of samples converted per row; identical for every row, so it is computed once.
    const int x_fin = width - crop_right - crop_left;
    for (int i = 0; i < 2; i++) {
        // The vertical extent is halved for the second plane (i == 1).
        const auto y_range = thread_y_range(crop_up >> i, (height - crop_bottom) >> i, thread_id, thread_n);
        // The left crop is counted in source samples, so it has to be scaled by the size of
        // the *source* sample type, not by a fixed 16-bit size.
        const uint8_t *srcLine = (const uint8_t *)src[i] + src_y_pitch_byte * y_range.start_src + crop_left * sizeof(Tin);
        uint8_t *dstLine = (uint8_t *)dst[i] + dst_y_pitch_byte * y_range.start_dst;
        for (int y = 0; y < y_range.len; y++, srcLine += src_y_pitch_byte, dstLine += dst_y_pitch_byte) {
            const Tin *ptrSrc = (const Tin *)srcLine;
            Tout *ptrDst = (Tout *)dstLine;
            for (int x = 0; x < x_fin; x++) {
                // NOTE(review): conv_bit_depth is assumed to take <in, out, shift_offset>, the same
                // order conv_bit_depth_rsft_add is called with in the AVX2 path - confirm against its definition.
                ptrDst[x] = (Tout)conv_bit_depth<in_bit_depth, out_bit_depth, 0>(ptrSrc[x]);
            }
        }
    }
}

// P010 -> NV12: reads 16-bit samples and writes 8-bit samples.
void copy_p010_to_nv12_c(void **dst, const void **src, int width, int src_y_pitch_byte, int src_uv_pitch_byte, int dst_y_pitch_byte, int height, int dst_height, int thread_id, int thread_n, int *crop) {
    return copy_nv12p010_to_nv12p010_c_internal<uint16_t, 16, uint8_t, 8>(dst, src, width, src_y_pitch_byte, src_uv_pitch_byte, dst_y_pitch_byte, height, dst_height, thread_id, thread_n, crop);
}

// NV12 -> P010: reads 8-bit samples and writes 16-bit samples.
void copy_nv12_to_p010_c(void **dst, const void **src, int width, int src_y_pitch_byte, int src_uv_pitch_byte, int dst_y_pitch_byte, int height, int dst_height, int thread_id, int thread_n, int *crop) {
    return copy_nv12p010_to_nv12p010_c_internal<uint8_t, 8, uint16_t, 16>(dst, src, width, src_y_pitch_byte, src_uv_pitch_byte, dst_y_pitch_byte, height, dst_height, thread_id, thread_n, crop);
}

void convert_yuy2_to_nv12(void **dst_array, const void **src_array, int width, int src_y_pitch_byte, int src_uv_pitch_byte, int dst_y_pitch_byte, int height, int dst_height, int thread_id, int thread_n, int *crop) {
int crop_left = crop[0];
int crop_up = crop[1];
Expand Down Expand Up @@ -1989,6 +2022,10 @@ static const ConvertCSP funcList[] = {
FUNC_AVX2( RGY_CSP_P010, RGY_CSP_P010, false, copy_p010_to_p010_avx2, copy_p010_to_p010_avx2, AVX2|AVX)
FUNC_SSE( RGY_CSP_P010, RGY_CSP_P010, false, copy_p010_to_p010_sse2, copy_p010_to_p010_sse2, SSE2 )
FUNC__C_( RGY_CSP_P010, RGY_CSP_P010, false, copy_p010_to_p010_c, copy_p010_to_p010_c, NONE)
FUNC_AVX2( RGY_CSP_NV12, RGY_CSP_P010, false, copy_nv12_to_p010_avx2, copy_nv12_to_p010_avx2, AVX2|AVX)
FUNC__C_( RGY_CSP_NV12, RGY_CSP_P010, false, copy_nv12_to_p010_c, copy_nv12_to_p010_c, NONE)
FUNC_AVX2( RGY_CSP_P010, RGY_CSP_NV12, false, copy_p010_to_nv12_avx2, copy_p010_to_nv12_avx2, AVX2|AVX)
FUNC__C_( RGY_CSP_P010, RGY_CSP_NV12, false, copy_p010_to_nv12_c, copy_p010_to_nv12_c, NONE)
#endif
#if !CLFILTERS_AUF
FUNC_AVX2( RGY_CSP_YUY2, RGY_CSP_NV12, false, convert_yuy2_to_nv12_avx2, convert_yuy2_to_nv12_i_avx2, AVX2|AVX)
Expand Down
54 changes: 54 additions & 0 deletions QSVPipeline/convert_csp_avx2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,60 @@ void copy_nv12_to_nv12_avx2(void **dst, const void **src, int width, int src_y_p
// P010 -> P010 copy (AVX2): forwards every argument unchanged to the shared
// copy_nv12_to_nv12_avx2_internal routine, instantiated with <true>.
void copy_p010_to_p010_avx2(void **dst, const void **src, int width, int src_y_pitch_byte, int src_uv_pitch_byte, int dst_y_pitch_byte, int height, int dst_height, int thread_id, int thread_n, int *crop) {
    copy_nv12_to_nv12_avx2_internal<true>(
        dst, src,
        width,
        src_y_pitch_byte, src_uv_pitch_byte, dst_y_pitch_byte,
        height, dst_height,
        thread_id, thread_n,
        crop);
}
// NV12 -> P010 (AVX2): widens every 8-bit sample of both planes to 16 bits by
// placing it in the upper byte of the output word (i.e. value << 8).
// Rows are processed in chunks of 32 samples; the last chunk of a row may run past
// the cropped width, so the buffers presumably need padding up to a multiple of 32
// samples - verify against the callers.
void copy_nv12_to_p010_avx2(void **dst, const void **src, int width, int src_y_pitch_byte, int src_uv_pitch_byte, int dst_y_pitch_byte, int height, int dst_height, int thread_id, int thread_n, int *crop) {
    const int crop_left   = crop[0];
    const int crop_up     = crop[1];
    const int crop_right  = crop[2];
    const int crop_bottom = crop[3];
    const int line_width  = width - crop_right - crop_left;
    for (int plane = 0; plane < 2; plane++) {
        // The second plane covers half as many rows as the first one.
        const auto rows = thread_y_range(crop_up >> plane, (height - crop_bottom) >> plane, thread_id, thread_n);
        const uint8_t *src_row = (const uint8_t *)src[plane] + src_y_pitch_byte * rows.start_src + crop_left;
        uint8_t *dst_row = (uint8_t *)dst[plane] + dst_y_pitch_byte * rows.start_dst;
        for (int y = 0; y < rows.len; y++) {
            const uint8_t *in = src_row;
            uint16_t *out = (uint16_t *)dst_row;
            for (int x = 0; x < line_width; x += 32) {
                __m256i pix = _mm256_loadu_si256((const __m256i *)(in + x));
                // Reorder the 64-bit quarters to (0, 2, 1, 3) so that the per-128-bit-lane
                // unpack instructions below emit the samples in their original order.
                pix = _mm256_permute4x64_epi64(pix, _MM_SHUFFLE(3, 1, 2, 0));
                // Interleaving with zero in the low byte yields sample << 8.
                const __m256i upper16 = _mm256_unpackhi_epi8(_mm256_setzero_si256(), pix);
                const __m256i lower16 = _mm256_unpacklo_epi8(_mm256_setzero_si256(), pix);
                _mm256_storeu_si256((__m256i *)(out + x +  0), lower16);
                _mm256_storeu_si256((__m256i *)(out + x + 16), upper16);
            }
            src_row += src_y_pitch_byte;
            dst_row += dst_y_pitch_byte;
        }
    }
}
// P010 -> NV12 (AVX2): narrows every 16-bit sample of both planes to 8 bits
// (add a rounding term, then shift right by 8) and stores the result as bytes.
//
//   dst, src           : arrays of plane pointers; indices 0 and 1 are accessed.
//   width              : number of samples per row before cropping.
//   src_y_pitch_byte   : source row stride in bytes, applied to both planes.
//   src_uv_pitch_byte  : not used by this routine.
//   dst_y_pitch_byte   : destination row stride in bytes, applied to both planes.
//   height             : number of rows of plane 0 before cropping; plane 1 uses height >> 1.
//   dst_height         : not used by this routine.
//   thread_id/thread_n : forwarded to thread_y_range() to select the rows handled by this call.
//   crop               : { left, up, right, bottom } crop amounts.
//
// Rows are processed in chunks of 32 samples; the last chunk of a row may run past
// the cropped width, so the buffers presumably need padding up to a multiple of 32
// samples - verify against the callers.
void copy_p010_to_nv12_avx2(void **dst, const void **src, int width, int src_y_pitch_byte, int src_uv_pitch_byte, int dst_y_pitch_byte, int height, int dst_height, int thread_id, int thread_n, int *crop) {
    const int crop_left   = crop[0];
    const int crop_up     = crop[1];
    const int crop_right  = crop[2];
    const int crop_bottom = crop[3];
    const int in_bit_depth = 16;
    // Rounding term added before the right shift (presumably half of the dropped range;
    // see conv_bit_depth_rsft_add, which is defined elsewhere).
    const __m256i yrsftAdd = _mm256_set1_epi16((short)conv_bit_depth_rsft_add<in_bit_depth, 8, 0>());
    const int y_width = width - crop_right - crop_left;
    for (int i = 0; i < 2; i++) {
        // The second plane covers half as many rows as the first one.
        const auto y_range = thread_y_range(crop_up >> i, (height - crop_bottom) >> i, thread_id, thread_n);
        // Source samples are 16 bits wide, so the left crop (counted in samples) must be
        // converted to a byte offset before it is added to the byte pointer.
        const uint8_t *srcYLine = (const uint8_t *)src[i] + src_y_pitch_byte * y_range.start_src + crop_left * sizeof(uint16_t);
        uint8_t *dstLine = (uint8_t *)dst[i] + dst_y_pitch_byte * y_range.start_dst;
        for (int y = 0; y < y_range.len; y++, srcYLine += src_y_pitch_byte, dstLine += dst_y_pitch_byte) {
            const uint16_t *src_ptr = (const uint16_t *)srcYLine;
            uint8_t *dst_ptr = dstLine;
            for (int x = 0; x < y_width; x += 32, dst_ptr += 32, src_ptr += 32) {
                // Load samples so that each 128-bit lane of (y0, y1) holds 16 consecutive
                // samples; the per-lane pack below then emits all 32 bytes in order.
                __m256i y0 = _mm256_loadu2_m128i((const __m128i *)(src_ptr + 16), (const __m128i *)(src_ptr + 0));
                __m256i y1 = _mm256_loadu2_m128i((const __m128i *)(src_ptr + 24), (const __m128i *)(src_ptr + 8));
                // The samples are unsigned 16-bit values, so the rounding add must saturate as
                // unsigned. A signed saturating add treats values >= 0x8000 as negative: the
                // brightest samples wrap around to almost zero, and values just below 0x8000
                // get clamped to 0x7FFF.
                y0 = _mm256_adds_epu16(y0, yrsftAdd);
                y1 = _mm256_adds_epu16(y1, yrsftAdd);
                y0 = _mm256_srli_epi16(y0, in_bit_depth - 8);
                y1 = _mm256_srli_epi16(y1, in_bit_depth - 8);
                // After the shift every word is in 0..255, so the pack never actually saturates.
                y0 = _mm256_packus_epi16(y0, y1);
                _mm256_storeu_si256((__m256i *)dst_ptr, y0);
            }
        }
    }
}

void convert_yuy2_to_nv12_avx2(void **dst_array, const void **src_array, int width, int src_y_pitch_byte, int src_uv_pitch_byte, int dst_y_pitch_byte, int height, int dst_height, int thread_id, int thread_n, int *crop) {
const int crop_left = crop[0];
Expand Down
1 change: 1 addition & 0 deletions QSVPipeline/rgy_avutil.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -840,6 +840,7 @@ static const auto CSP_PIXFMT_RGY = make_array<std::pair<AVPixelFormat, RGY_CSP>>
std::make_pair(AV_PIX_FMT_YUVJ420P, RGY_CSP_YV12),
std::make_pair(AV_PIX_FMT_NV12, RGY_CSP_NV12),
std::make_pair(AV_PIX_FMT_NV21, RGY_CSP_NV12),
std::make_pair(AV_PIX_FMT_P010LE, RGY_CSP_P010),
std::make_pair(AV_PIX_FMT_YUV422P, RGY_CSP_YUV422),
std::make_pair(AV_PIX_FMT_YUVJ422P, RGY_CSP_YUV422),
std::make_pair(AV_PIX_FMT_YUYV422, RGY_CSP_YUY2),
Expand Down

0 comments on commit 51e1594

Please sign in to comment.