New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Rewrite the blur shader to remove the variable-length loop and to use the texture filtering hardware more effectively. #3028
Changes from 1 commit
b9ae5a3
28f977c
e509c87
4684f99
File filter...
Jump to…
Rewrite the blur shader to remove the variable-length loop and to use…
… the texture filtering hardware more effectively. This new shader performs Gaussian *resampling* instead of regular convolution. It samples in between texels to reduce the number of taps. The speed is about the same as the existing technique. It is nevertheless the fastest blur method that I could come up with. In particular, it exceeds the performance of the Kawase and dual Kawase blur techniques. The speed comes from working at lower resolution and incurring fewer downsampling and upsampling passes. As is often the case, ALU performance does not really seem to be the limiting factor; it's mostly memory bandwidth, which is why downsampling is so important. Further improvements should come from not doing the downsampling at all and instead rendering the blurred content at low resolution to begin with. Closes #2821.
- Loading branch information
| @@ -5,131 +5,120 @@ | ||
| #include shared,prim_shared | ||
|
|
||
| varying vec3 vUv; | ||
| flat varying vec4 vUvRect; | ||
| flat varying vec2 vOffsetScale; | ||
| flat varying float vSigma; | ||
| flat varying int vBlurRadius; | ||
| flat varying vec2 vSrcSizeInv; | ||
| flat varying vec4 vSrcRect; | ||
| // The coefficient and `exp(2.0 * coefficient)`, in that order. | ||
| flat varying vec2 vCoefficients; | ||
| flat varying int vVertical; | ||
|
|
||
| #ifdef WR_FEATURE_COLOR_TARGET | ||
| #define TEXTURE_SIZE() vec2(textureSize(sCacheRGBA8, 0).xy) | ||
| #else | ||
| #define TEXTURE_SIZE() vec2(textureSize(sCacheA8, 0).xy) | ||
| #endif | ||
|
|
||
| #ifdef WR_VERTEX_SHADER | ||
| // Applies a separable gaussian blur in one direction, as specified | ||
| // by the dir field in the blur command. | ||
|
|
||
| #define DIR_HORIZONTAL 0 | ||
| #define DIR_VERTICAL 1 | ||
|
|
||
| in int aBlurRenderTaskAddress; | ||
| in int aBlurSourceTaskAddress; | ||
| in int aBlurDirection; | ||
|
|
||
| struct BlurTask { | ||
| RenderTaskCommonData common_data; | ||
| float blur_radius; | ||
| vec2 coefficients; | ||
| int direction; | ||
| }; | ||
|
|
||
| BlurTask fetch_blur_task(int address) { | ||
| BlurTask fetchBlurTask(int address) { | ||
pcwalton
Author
Collaborator
|
||
| RenderTaskData task_data = fetch_render_task_data(address); | ||
|
|
||
| BlurTask task = BlurTask( | ||
| task_data.common_data, | ||
| task_data.data1.x | ||
| ); | ||
|
|
||
| BlurTask task = BlurTask(task_data.common_data, | ||
| task_data.data1.xy, | ||
| int(task_data.data1.z)); | ||
| return task; | ||
| } | ||
|
|
||
| void main(void) { | ||
| BlurTask blur_task = fetch_blur_task(aBlurRenderTaskAddress); | ||
| RenderTaskCommonData src_task = fetch_render_task_common_data(aBlurSourceTaskAddress); | ||
| BlurTask blurTask = fetchBlurTask(aBlurRenderTaskAddress); | ||
| RenderTaskCommonData srcTask = fetch_render_task_common_data(aBlurSourceTaskAddress); | ||
|
|
||
| RectWithSize src_rect = src_task.task_rect; | ||
| RectWithSize target_rect = blur_task.common_data.task_rect; | ||
|
|
||
| #if defined WR_FEATURE_COLOR_TARGET | ||
| vec2 texture_size = vec2(textureSize(sCacheRGBA8, 0).xy); | ||
| #else | ||
| vec2 texture_size = vec2(textureSize(sCacheA8, 0).xy); | ||
| #endif | ||
| vUv.z = src_task.texture_layer_index; | ||
| vBlurRadius = int(3.0 * blur_task.blur_radius); | ||
| vSigma = blur_task.blur_radius; | ||
|
|
||
| switch (aBlurDirection) { | ||
| case DIR_HORIZONTAL: | ||
| vOffsetScale = vec2(1.0 / texture_size.x, 0.0); | ||
| break; | ||
| case DIR_VERTICAL: | ||
| vOffsetScale = vec2(0.0, 1.0 / texture_size.y); | ||
| break; | ||
| default: | ||
| vOffsetScale = vec2(0.0); | ||
| } | ||
| RectWithSize srcRect = srcTask.task_rect; | ||
| RectWithSize targetRect = blurTask.common_data.task_rect; | ||
|
|
||
| vUvRect = vec4(src_rect.p0 + vec2(0.5), | ||
| src_rect.p0 + src_rect.size - vec2(0.5)); | ||
| vUvRect /= texture_size.xyxy; | ||
| vec2 position = targetRect.p0 + targetRect.size * aPosition.xy; | ||
|
|
||
| vec2 pos = target_rect.p0 + target_rect.size * aPosition.xy; | ||
| vec4 uvBounds = vec4(srcRect.p0, srcRect.p0 + srcRect.size); | ||
|
|
||
| vec2 uv0 = src_rect.p0 / texture_size; | ||
| vec2 uv1 = (src_rect.p0 + src_rect.size) / texture_size; | ||
| vUv.xy = mix(uv0, uv1, aPosition.xy); | ||
| vUv = vec3(mix(uvBounds.xy, uvBounds.zw, aPosition.xy), srcTask.texture_layer_index); | ||
| vSrcSizeInv = 1.0 / TEXTURE_SIZE(); | ||
| vSrcRect = vec4(srcRect.p0, srcRect.p0 + srcRect.size) + vec4(0.5, 0.5, -0.5, -0.5); | ||
| vCoefficients = blurTask.coefficients; | ||
| vVertical = blurTask.direction; | ||
|
|
||
| gl_Position = uTransform * vec4(pos, 0.0, 1.0); | ||
| gl_Position = uTransform * vec4(position, 0.0, 1.0); | ||
| } | ||
|
|
||
| #endif | ||
|
|
||
| #ifdef WR_FRAGMENT_SHADER | ||
|
|
||
| #if defined WR_FEATURE_COLOR_TARGET | ||
| #define SUPPORT 4 | ||
|
||
|
|
||
| #ifdef WR_FEATURE_COLOR_TARGET | ||
| #define SAMPLE_TYPE vec4 | ||
| #define SAMPLE_TEXTURE(uv) texture(sCacheRGBA8, uv) | ||
| #else | ||
| #define SAMPLE_TYPE float | ||
| #define SAMPLE_TEXTURE(uv) texture(sCacheA8, uv).r | ||
| #endif | ||
|
|
||
| // TODO(gw): Write a fast path blur that handles smaller blur radii | ||
| // with an offset / weight uniform table and a constant | ||
| // loop iteration count! | ||
|
|
||
| // TODO(gw): Make use of the bilinear sampling trick to reduce | ||
| // the number of texture fetches needed for a gaussian blur. | ||
| // Accumulates two texels into the blurred fragment we're building up. | ||
| void accumulate(float offset, | ||
| float crossAxisCoord, | ||
| inout vec2 gaussCoefficient, | ||
| inout SAMPLE_TYPE colorSum, | ||
| inout float factorSum) { | ||
| float factorA = gaussCoefficient.x; | ||
| gaussCoefficient *= vec2(gaussCoefficient.y, vCoefficients.y); | ||
| float factorB = gaussCoefficient.x; | ||
| gaussCoefficient *= vec2(gaussCoefficient.y, vCoefficients.y); | ||
|
|
||
| // Compute the texture coordinate that provides the correct linear combination of the two | ||
| // texels in question. | ||
| float factors = factorA + factorB; | ||
| float sampleOffset = offset + factorB / factors; | ||
|
|
||
| vec2 texCoord = vec2(sampleOffset, crossAxisCoord); | ||
| texCoord = clamp(vVertical != 0 ? texCoord.yx : texCoord.xy, vSrcRect.xy, vSrcRect.zw); | ||
|
|
||
| colorSum += factors * SAMPLE_TEXTURE(vec3(texCoord * vSrcSizeInv, vUv.z)); | ||
| factorSum += factors; | ||
| } | ||
|
|
||
| void main(void) { | ||
| SAMPLE_TYPE original_color = SAMPLE_TEXTURE(vUv); | ||
|
|
||
| // TODO(gw): The gauss function gets NaNs when blur radius | ||
| // is zero. In the future, detect this earlier | ||
| // and skip the blur passes completely. | ||
| if (vBlurRadius == 0) { | ||
| oFragColor = vec4(original_color); | ||
| // FIXME(pcwalton): We shouldn't end up with zero blur radii in the first place! | ||
| if (vCoefficients.x == 0.0) { | ||
| vec2 texCoord = clamp(vUv.xy, vSrcRect.xy, vSrcRect.zw); | ||
| oFragColor = vec4(SAMPLE_TEXTURE(vec3(texCoord * vSrcSizeInv, vUv.z))); | ||
| return; | ||
| } | ||
|
|
||
| // Incremental Gaussian Coefficient Calculation (See GPU Gems 3 pp. 877 - 889) | ||
| vec3 gauss_coefficient; | ||
| gauss_coefficient.x = 1.0 / (sqrt(2.0 * 3.14159265) * vSigma); | ||
| gauss_coefficient.y = exp(-0.5 / (vSigma * vSigma)); | ||
| gauss_coefficient.z = gauss_coefficient.y * gauss_coefficient.y; | ||
|
|
||
| float gauss_coefficient_sum = 0.0; | ||
| SAMPLE_TYPE avg_color = original_color * gauss_coefficient.x; | ||
| gauss_coefficient_sum += gauss_coefficient.x; | ||
| gauss_coefficient.xy *= gauss_coefficient.yz; | ||
| bool vertical = vVertical != 0; | ||
| vec2 axisCoord = vertical ? vUv.yx : vUv.xy; | ||
| float start = floor(axisCoord.x - float(SUPPORT)) + 0.5; | ||
kvark
Member
|
||
|
|
||
| for (int i=1 ; i <= vBlurRadius ; ++i) { | ||
| vec2 offset = vOffsetScale * float(i); | ||
| float offset = start - axisCoord.x; | ||
|
|
||
| vec2 st0 = clamp(vUv.xy - offset, vUvRect.xy, vUvRect.zw); | ||
| avg_color += SAMPLE_TEXTURE(vec3(st0, vUv.z)) * gauss_coefficient.x; | ||
| // See K. Turkowski, "Incremental Computation of the Gaussian", GPU Gems 3, chapter 40: | ||
| // | ||
| // https://developer.nvidia.com/gpugems/GPUGems3/gpugems3_ch40.html | ||
| vec2 gaussCoefficient = exp(vCoefficients.x * vec2(offset * offset, 2.0 * offset + 1.0)); | ||
|
|
||
| vec2 st1 = clamp(vUv.xy + offset, vUvRect.xy, vUvRect.zw); | ||
| avg_color += SAMPLE_TEXTURE(vec3(st1, vUv.z)) * gauss_coefficient.x; | ||
| SAMPLE_TYPE colorSum = SAMPLE_TYPE(0.0); | ||
| float factorSum = 0.0; | ||
|
|
||
| gauss_coefficient_sum += 2.0 * gauss_coefficient.x; | ||
| gauss_coefficient.xy *= gauss_coefficient.yz; | ||
| } | ||
| for (int i = 0; i < SUPPORT + 1; i++) | ||
| accumulate(start + float(i) * 2.0, axisCoord.y, gaussCoefficient, colorSum, factorSum); | ||
|
|
||
| oFragColor = vec4(avg_color) / gauss_coefficient_sum; | ||
| oFragColor = vec4(colorSum / factorSum); | ||
| } | ||
|
|
||
| #endif | ||
Please stick to the existing naming convention in the shaders.