Skip to content

Commit 32a0178

Browse files
committed
vp9: sse2/ssse3/avx 16bpp loopfilter x86 simd.
1 parent 6ce0212 commit 32a0178

File tree

6 files changed

+918
-1
lines changed

6 files changed

+918
-1
lines changed

libavcodec/x86/Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,7 @@ YASM-OBJS-$(CONFIG_VP6_DECODER) += x86/vp6dsp.o
160160
YASM-OBJS-$(CONFIG_VP9_DECODER) += x86/vp9intrapred.o \
161161
x86/vp9itxfm.o \
162162
x86/vp9lpf.o \
163+
x86/vp9lpf_16bpp.o \
163164
x86/vp9mc.o \
164165
x86/vp9mc_16bpp.o
165166
YASM-OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp.o

libavcodec/x86/constants.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,8 @@ DECLARE_ALIGNED(32, const ymm_reg, ff_pw_1024) = { 0x0400040004000400ULL, 0x040
5555
0x0400040004000400ULL, 0x0400040004000400ULL};
5656
/* 32-byte-aligned constant: every 16-bit field of the four 64-bit
 * initializers holds 0x0800 (2048). */
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_2048) = { 0x0800080008000800ULL, 0x0800080008000800ULL,
5757
0x0800080008000800ULL, 0x0800080008000800ULL };
58+
/* 32-byte-aligned constant: every 16-bit field of the four 64-bit
 * initializers holds 0x0fff (4095).
 * NOTE(review): presumably the maximum 12-bit sample value used by the
 * 16bpp loop filter code added in this commit -- confirm against the asm. */
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_4095) = { 0x0fff0fff0fff0fffULL, 0x0fff0fff0fff0fffULL,
59+
0x0fff0fff0fff0fffULL, 0x0fff0fff0fff0fffULL };
5860
/* 32-byte-aligned constant: every 16-bit field of the four 64-bit
 * initializers holds 0x1000 (4096). */
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_4096) = { 0x1000100010001000ULL, 0x1000100010001000ULL,
5961
0x1000100010001000ULL, 0x1000100010001000ULL };
6062
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_8192) = { 0x2000200020002000ULL, 0x2000200020002000ULL,

libavcodec/x86/constants.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ extern const ymm_reg ff_pw_512;
4747
extern const ymm_reg ff_pw_1023;
4848
extern const ymm_reg ff_pw_1024;
4949
extern const ymm_reg ff_pw_2048;
50+
extern const ymm_reg ff_pw_4095;
5051
extern const ymm_reg ff_pw_4096;
5152
extern const ymm_reg ff_pw_8192;
5253
extern const ymm_reg ff_pw_m1;

libavcodec/x86/vp9dsp_init_16bpp_template.c

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,16 +65,106 @@ filters_8tap_1d_fn2(put, 16, BPC, avx2, 16bpp)
6565
filters_8tap_1d_fn2(avg, 16, BPC, avx2, 16bpp)
6666
#endif
6767

68+
/* Expands to the prototype of one loop filter function named
 * ff_vp9_loop_filter_<dir>_<wd>_<bpp>_<opt>, taking
 * (uint8_t *dst, ptrdiff_t stride, int E, int I, int H) and returning void.
 * The expansion has no trailing semicolon; the user of the macro supplies it. */
#define decl_lpf_func(dir, wd, bpp, opt) \
69+
void ff_vp9_loop_filter_##dir##_##wd##_##bpp##_##opt(uint8_t *dst, ptrdiff_t stride, \
70+
int E, int I, int H)
71+
72+
/* Declares the sse2, ssse3 and avx variants of the loop filter for one
 * direction/width/bit-depth combination via decl_lpf_func(). */
#define decl_lpf_funcs(dir, wd, bpp) \
73+
decl_lpf_func(dir, wd, bpp, sse2); \
74+
decl_lpf_func(dir, wd, bpp, ssse3); \
75+
decl_lpf_func(dir, wd, bpp, avx)
76+
77+
/* Declares, for one direction, the filters of width 4, 8 and 16 at bit
 * depth BPC (BPC must be defined where this macro is expanded). */
#define decl_lpf_funcs_wd(dir) \
77+
decl_lpf_funcs(dir, 4, BPC); \
78+
decl_lpf_funcs(dir, 8, BPC); \
79+
decl_lpf_funcs(dir, 16, BPC)
81+
82+
/* Prototypes for every h and v loop filter: widths 4/8/16, each in
 * sse2/ssse3/avx flavours, at bit depth BPC. */
decl_lpf_funcs_wd(h);
83+
decl_lpf_funcs_wd(v);
84+
85+
/* Defines the static function loop_filter_<dir>_16_<bpp>_<opt>, which calls
 * ff_vp9_loop_filter_<dir>_16_<bpp>_<opt> twice with the same stride, E, I
 * and H: once at dst and once at dst + off.
 * `off` is pasted into `dst + off` without parentheses and is evaluated
 * inside the generated function, so it may refer to `stride`; pass only
 * expressions that stay correct as the right operand of `+`. */
#define lpf_16_wrapper(dir, off, bpp, opt) \
86+
static void loop_filter_##dir##_16_##bpp##_##opt(uint8_t *dst, ptrdiff_t stride, \
87+
int E, int I, int H) \
88+
{ \
89+
ff_vp9_loop_filter_##dir##_16_##bpp##_##opt(dst, stride, E, I, H); \
90+
ff_vp9_loop_filter_##dir##_16_##bpp##_##opt(dst + off, stride, E, I, H); \
91+
}
92+
93+
/* Instantiates lpf_16_wrapper() for both directions: the h wrapper makes its
 * second call at dst + 8 * stride, the v wrapper at dst + 16.
 * NOTE(review): presumably 8 rows down for h, and 8 two-byte pixels (16
 * bytes) to the right for v -- confirm against the asm implementation. */
#define lpf_16_wrappers(bpp, opt) \
94+
lpf_16_wrapper(h, 8 * stride, bpp, opt); \
95+
lpf_16_wrapper(v, 16, bpp, opt)
96+
97+
/* Define the h/v 16-wide wrapper functions for each instruction set. */
lpf_16_wrappers(BPC, sse2);
98+
lpf_16_wrappers(BPC, ssse3);
99+
lpf_16_wrappers(BPC, avx);
100+
101+
/* Defines the static function loop_filter_<dir>_<wd1><wd2>_<bpp>_<opt>,
 * which filters two adjacent edges with possibly different filter widths:
 *   - the wd1 filter runs at dst with the low 8 bits of E, I and H;
 *   - the wd2 filter runs at dst + off with E >> 8, I >> 8 and H >> 8.
 * The shifted values are passed through without further masking.
 * `off` is pasted into `dst + off` without parentheses and may refer to
 * `stride`, which is in scope inside the generated function. */
#define lpf_mix2_wrapper(dir, off, wd1, wd2, bpp, opt) \
102+
static void loop_filter_##dir##_##wd1##wd2##_##bpp##_##opt(uint8_t *dst, ptrdiff_t stride, \
103+
int E, int I, int H) \
104+
{ \
105+
ff_vp9_loop_filter_##dir##_##wd1##_##bpp##_##opt(dst, stride, \
106+
E & 0xff, I & 0xff, H & 0xff); \
107+
ff_vp9_loop_filter_##dir##_##wd2##_##bpp##_##opt(dst + off, stride, \
108+
E >> 8, I >> 8, H >> 8); \
109+
}
110+
111+
/* Instantiates lpf_mix2_wrapper() for one (wd1, wd2) pair in both
 * directions, using the same second-call offsets as lpf_16_wrappers():
 * 8 * stride for h and 16 for v. */
#define lpf_mix2_wrappers(wd1, wd2, bpp, opt) \
112+
lpf_mix2_wrapper(h, 8 * stride, wd1, wd2, bpp, opt); \
113+
lpf_mix2_wrapper(v, 16, wd1, wd2, bpp, opt)
114+
115+
/* Instantiates the mix2 wrappers for all four width pairs: 4/4, 4/8, 8/4
 * and 8/8.
 * NOTE(review): unlike the sibling macros, the last line ends with `;` and a
 * continuation backslash, so the line that follows this definition becomes
 * part of the macro; it must stay empty. */
#define lpf_mix2_wrappers_set(bpp, opt) \
116+
lpf_mix2_wrappers(4, 4, bpp, opt); \
117+
lpf_mix2_wrappers(4, 8, bpp, opt); \
118+
lpf_mix2_wrappers(8, 4, bpp, opt); \
119+
lpf_mix2_wrappers(8, 8, bpp, opt); \
120+
121+
/* Define the full set of h/v mix2 wrapper functions for each instruction set. */
lpf_mix2_wrappers_set(BPC, sse2);
122+
lpf_mix2_wrappers_set(BPC, ssse3);
123+
lpf_mix2_wrappers_set(BPC, avx);
68124
#endif /* HAVE_YASM */
69125

70126
av_cold void INIT_FUNC(VP9DSPContext *dsp)
71127
{
72128
#if HAVE_YASM
73129
int cpu_flags = av_get_cpu_flags();
74130

131+
/* Stores the external function ff_vp9_loop_filter_<dir>_<wd>_<bpp>_<opt>
 * into dsp->loop_filter_8[idx1][idx2]. Requires `dsp` in scope. */
#define init_lpf_8_func(idx1, idx2, dir, wd, bpp, opt) \
132+
dsp->loop_filter_8[idx1][idx2] = ff_vp9_loop_filter_##dir##_##wd##_##bpp##_##opt
133+
/* Stores the static wrapper loop_filter_<dir>_16_<bpp>_<opt> (generated by
 * lpf_16_wrapper) into dsp->loop_filter_16[idx]. Requires `dsp` in scope. */
#define init_lpf_16_func(idx, dir, bpp, opt) \
134+
dsp->loop_filter_16[idx] = loop_filter_##dir##_16_##bpp##_##opt
135+
/* Stores the static wrapper loop_filter_<dir>_<wd1><wd2>_<bpp>_<opt>
 * (generated by lpf_mix2_wrapper) into
 * dsp->loop_filter_mix2[idx1][idx2][idx3]. Requires `dsp` in scope. */
#define init_lpf_mix2_func(idx1, idx2, idx3, dir, wd1, wd2, bpp, opt) \
136+
dsp->loop_filter_mix2[idx1][idx2][idx3] = loop_filter_##dir##_##wd1##wd2##_##bpp##_##opt
137+
138+
/* Installs every loop filter entry point of one instruction set into `dsp`.
 * Index layout as used below:
 *   loop_filter_8[w][d]       w: 0 = wd 4, 1 = wd 8, 2 = wd 16;  d: 0 = h, 1 = v
 *   loop_filter_16[d]         d: 0 = h, 1 = v
 *   loop_filter_mix2[a][b][d] a/b: 0 = wd 4, 1 = wd 8 for the first/second
 *                             filter;  d: 0 = h, 1 = v
 * Each use overwrites all of these slots, so when the macro is used for
 * several instruction sets in a row the last one wins. */
#define init_lpf_funcs(bpp, opt) \
139+
init_lpf_8_func(0, 0, h, 4, bpp, opt); \
140+
init_lpf_8_func(0, 1, v, 4, bpp, opt); \
141+
init_lpf_8_func(1, 0, h, 8, bpp, opt); \
142+
init_lpf_8_func(1, 1, v, 8, bpp, opt); \
143+
init_lpf_8_func(2, 0, h, 16, bpp, opt); \
144+
init_lpf_8_func(2, 1, v, 16, bpp, opt); \
145+
init_lpf_16_func(0, h, bpp, opt); \
146+
init_lpf_16_func(1, v, bpp, opt); \
147+
init_lpf_mix2_func(0, 0, 0, h, 4, 4, bpp, opt); \
148+
init_lpf_mix2_func(0, 1, 0, h, 4, 8, bpp, opt); \
149+
init_lpf_mix2_func(1, 0, 0, h, 8, 4, bpp, opt); \
150+
init_lpf_mix2_func(1, 1, 0, h, 8, 8, bpp, opt); \
151+
init_lpf_mix2_func(0, 0, 1, v, 4, 4, bpp, opt); \
152+
init_lpf_mix2_func(0, 1, 1, v, 4, 8, bpp, opt); \
153+
init_lpf_mix2_func(1, 0, 1, v, 8, 4, bpp, opt); \
154+
init_lpf_mix2_func(1, 1, 1, v, 8, 8, bpp, opt)
155+
75156
if (EXTERNAL_SSE2(cpu_flags)) {
76157
init_subpel3(0, put, BPC, sse2);
77158
init_subpel3(1, avg, BPC, sse2);
159+
init_lpf_funcs(BPC, sse2);
160+
}
161+
162+
if (EXTERNAL_SSSE3(cpu_flags)) {
163+
init_lpf_funcs(BPC, ssse3);
164+
}
165+
166+
if (EXTERNAL_AVX(cpu_flags)) {
167+
init_lpf_funcs(BPC, avx);
78168
}
79169

80170
if (EXTERNAL_AVX2(cpu_flags)) {

0 commit comments

Comments
 (0)