Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Some tweaks to the pre-multiply alpha blending #1834

Merged
merged 7 commits into from May 26, 2020
189 changes: 144 additions & 45 deletions src_c/alphablit.c
Expand Up @@ -30,9 +30,17 @@
#include "include/sse2neon.h"
#else
#if IS_SDLv1
// MSVC uses these defines for SSE2 support for some reason
#if defined(_M_IX86_FP) || (defined(_M_AMD64) || defined(_M_X64))
#if (_M_IX86_FP == 2) || (defined(_M_AMD64) || defined(_M_X64))
#define __SSE2__ 1
#endif
#endif
// SDL 1 doesn't import the latest intrinsics; this should pull
// them all in for us
#include <immintrin.h>
#ifdef __SSE2__ // don't import this file on non-SSE platforms.
#include <immintrin.h>
#endif /* __SSE2__ */
#endif /* IS_SDLv1 */
#endif /* PG_ENABLE_ARM_NEON */

Expand Down Expand Up @@ -270,38 +278,44 @@ SoftBlitPyGame (SDL_Surface * src, SDL_Rect * srcrect, SDL_Surface * dst,
}
case PYGAME_BLEND_PREMULTIPLIED:
{
#if defined(__MMX__) || defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON)
if (src->format->Rmask == dst->format->Rmask
&& src->format->Gmask == dst->format->Gmask
&& src->format->Bmask == dst->format->Bmask
&& src->format->BytesPerPixel == 4
&& src->format->Rshift % 8 == 0
&& src->format->Gshift % 8 == 0
&& src->format->Bshift % 8 == 0
&& src->format->Ashift % 8 == 0
&& src->format->Aloss == 0){

#if PG_ENABLE_ARM_NEON
#if IS_SDLv1
if (src->format->BytesPerPixel == 4 &&
dst->format->BytesPerPixel == 4 &&
src->format->Rmask == dst->format->Rmask &&
src->format->Gmask == dst->format->Gmask &&
src->format->Bmask == dst->format->Bmask &&
info.src_flags & SDL_SRCALPHA)
#else /* IS_SDLv2 */
if (src->format->BytesPerPixel == 4 &&
dst->format->BytesPerPixel == 4 &&
src->format->Rmask == dst->format->Rmask &&
src->format->Gmask == dst->format->Gmask &&
src->format->Bmask == dst->format->Bmask &&
info.src_blend != SDL_BLENDMODE_NONE)
#endif /* IS_SDLv2 */
{
#if defined(__MMX__) || defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON)
#if PG_ENABLE_ARM_NEON
if (SDL_HasNEON() == SDL_TRUE){
blit_blend_premultiplied_sse2 (&info);
break;
}
#endif /* PG_ENABLE_ARM_NEON */
#ifdef __SSE2__
if (SDL_HasSSE2() == SDL_TRUE){
#endif /* PG_ENABLE_ARM_NEON */
#ifdef __SSE2__
if (SDL_HasSSE2()){
blit_blend_premultiplied_sse2 (&info);
break;
}
#endif /* __SSE2__*/
#ifdef __MMX__
#endif /* __SSE2__*/
#ifdef __MMX__
if (SDL_HasMMX() == SDL_TRUE) {
blit_blend_premultiplied_mmx (&info);
break;
}
#endif /*__MMX__*/

#endif /*__MMX__*/
#endif /*__MMX__ || __SSE2__ || PG_ENABLE_ARM_NEON*/
}
#endif /*__MMX__ || __SSE2__ || PG_ENABLE_ARM_NEON*/

blit_blend_premultiplied (&info);
break;
}
Expand Down Expand Up @@ -1100,13 +1114,16 @@ blit_blend_premultiplied_sse2(SDL_BlitInfo * info)
int dstskip = info->d_skip >> 2;
SDL_PixelFormat *srcfmt = info->src;
Uint32 amask = srcfmt->Amask;
Uint64 multmask2;
Uint64 multmask;
Uint64 ones;

__m128i src1, dst1, mm_alpha, mm_zero, mm_alpha2, multmask2_128;
__m128i src1, dst1, sub_dst, mm_alpha, mm_zero, multmask_128, ones_128;

mm_zero = _mm_setzero_si128();
multmask2 = 0x00FF00FF00FF00FF; // 0F0F0F0F
multmask2_128 = _mm_loadl_epi64((const __m128i *) & multmask2);
multmask = 0x00FF00FF00FF00FF; // 0F0F0F0F
multmask_128 = _mm_loadl_epi64((const __m128i *) & multmask);
ones = 0x0001000100010001;
ones_128 = _mm_loadl_epi64((const __m128i *) & ones);

while (height--) {
/* *INDENT-OFF* */
Expand All @@ -1126,13 +1143,14 @@ blit_blend_premultiplied_sse2(SDL_BlitInfo * info)
mm_alpha = _mm_cvtsi32_si128(alpha); /* alpha -> mm_alpha (000000000000A000) */
mm_alpha = _mm_srli_si128(mm_alpha, 3); /* mm_alpha >> ashift -> mm_alpha(000000000000000A) */
mm_alpha = _mm_unpacklo_epi16(mm_alpha, mm_alpha); /* 0000000000000A0A -> mm_alpha */
mm_alpha2 = _mm_unpacklo_epi32(mm_alpha, mm_alpha); /* 000000000A0A0A0A -> mm_alpha2 */
mm_alpha2 = _mm_xor_si128(mm_alpha2, multmask2_128); /* 255 - mm_alpha -> mm_alpha */
mm_alpha = _mm_unpacklo_epi32(mm_alpha, mm_alpha); /* 000000000A0A0A0A -> mm_alpha2 */

/* pre-multiplied alpha blend */
dst1 = _mm_mullo_epi16(dst1, mm_alpha2);
dst1 = _mm_srli_epi16(dst1, 8);
sub_dst = _mm_add_epi16(dst1, ones_128);
sub_dst = _mm_mullo_epi16(sub_dst, mm_alpha);
sub_dst = _mm_srli_epi16(sub_dst, 8);
dst1 = _mm_add_epi16(src1, dst1);
dst1 = _mm_sub_epi16(dst1, sub_dst);
dst1 = _mm_packus_epi16(dst1, mm_zero);

*dstp = _mm_cvtsi128_si32(dst1);
Expand Down Expand Up @@ -1236,9 +1254,46 @@ blit_blend_premultiplied (SDL_BlitInfo * info)
int dstppa = info->dst_blend != SDL_BLENDMODE_NONE && dstfmt->Amask;
#endif /* IS_SDLv2 */

/*
printf ("Premultiplied alpha blit with %d and %d\n", srcbpp, dstbpp);
*/
#if IS_SDLv1
if (srcbpp >= 3 && dstbpp >= 3 && !(info->src_flags & SDL_SRCALPHA))
#else /* IS_SDLv2 */
if (srcbpp >= 3 && dstbpp >= 3 && info->src_blend == SDL_BLENDMODE_NONE)
#endif /* IS_SDLv2 */
{
size_t srcoffsetR, srcoffsetG, srcoffsetB;
size_t dstoffsetR, dstoffsetG, dstoffsetB;
if (srcbpp == 3)
{
SET_OFFSETS_24 (srcoffsetR, srcoffsetG, srcoffsetB, srcfmt);
}
else
{
SET_OFFSETS_32 (srcoffsetR, srcoffsetG, srcoffsetB, srcfmt);
}
if (dstbpp == 3)
{
SET_OFFSETS_24 (dstoffsetR, dstoffsetG, dstoffsetB, dstfmt);
}
else
{
SET_OFFSETS_32 (dstoffsetR, dstoffsetG, dstoffsetB, dstfmt);
}
while (height--)
{
LOOP_UNROLLED4(
{
dst[dstoffsetR] = src[srcoffsetR];
dst[dstoffsetG] = src[srcoffsetG];
dst[dstoffsetB] = src[srcoffsetB];

src += srcpxskip;
dst += dstpxskip;
}, n, width);
src += srcskip;
dst += dstskip;
}
return;
}

if (srcbpp == 1)
{
Expand All @@ -1250,9 +1305,30 @@ blit_blend_premultiplied (SDL_BlitInfo * info)
{
GET_PIXELVALS_1(sR, sG, sB, sA, src, srcfmt);
GET_PIXELVALS_1(dR, dG, dB, dA, dst, dstfmt);
// Source alpha is 255 so we can skip the blend and just
// use the source
CREATE_PIXEL(dst, sR, sG, sB, sA, dstbpp, dstfmt);
ALPHA_BLEND_PREMULTIPLIED (tmp, sR, sG, sB, sA, dR, dG, dB, dA);
SET_PIXELVAL (dst, dstfmt, dR, dG, dB, dA);
src += srcpxskip;
dst += dstpxskip;
}, n, width);
src += srcskip;
dst += dstskip;
}
}
else if (dstbpp == 3)
{
size_t offsetR, offsetG, offsetB;
SET_OFFSETS_24 (offsetR, offsetG, offsetB, dstfmt);
while (height--)
{
LOOP_UNROLLED4(
{
GET_PIXELVALS_1(sR, sG, sB, sA, src, srcfmt);
GET_PIXEL (pixel, dstbpp, dst);
GET_PIXELVALS (dR, dG, dB, dA, pixel, dstfmt, dstppa);
ALPHA_BLEND_PREMULTIPLIED (tmp, sR, sG, sB, sA, dR, dG, dB, dA);
dst[offsetR] = dR;
dst[offsetG] = dG;
dst[offsetB] = dB;
src += srcpxskip;
dst += dstpxskip;
}, n, width);
Expand All @@ -1269,9 +1345,8 @@ blit_blend_premultiplied (SDL_BlitInfo * info)
GET_PIXELVALS_1(sR, sG, sB, sA, src, srcfmt);
GET_PIXEL (pixel, dstbpp, dst);
GET_PIXELVALS (dR, dG, dB, dA, pixel, dstfmt, dstppa);
// Source alpha is 255 so we can skip the blend and just
// use the source
CREATE_PIXEL(dst, sR, sG, sB, sA, dstbpp, dstfmt);
ALPHA_BLEND_PREMULTIPLIED (tmp, sR, sG, sB, sA, dR, dG, dB, dA);
CREATE_PIXEL(dst, dR, dG, dB, dA, dstbpp, dstfmt);
src += srcpxskip;
dst += dstpxskip;
}, n, width);
Expand All @@ -1291,26 +1366,50 @@ blit_blend_premultiplied (SDL_BlitInfo * info)
GET_PIXEL(pixel, srcbpp, src);
GET_PIXELVALS (sR, sG, sB, sA, pixel, srcfmt, srcppa);
GET_PIXELVALS_1(dR, dG, dB, dA, dst, dstfmt);
// We can save some blending time by just copying pixels
// with alphas of 255 or 0
ALPHA_BLEND_PREMULTIPLIED (tmp, sR, sG, sB, sA, dR, dG, dB, dA);
SET_PIXELVAL (dst, dstfmt, dR, dG, dB, dA);
src += srcpxskip;
dst += dstpxskip;
}, n, width);
src += srcskip;
dst += dstskip;
}

}
else if (dstbpp == 3)
{
size_t offsetR, offsetG, offsetB;
SET_OFFSETS_24 (offsetR, offsetG, offsetB, dstfmt);
while (height--)
{
LOOP_UNROLLED4(
{
GET_PIXEL(pixel, srcbpp, src);
GET_PIXELVALS (sR, sG, sB, sA, pixel, srcfmt, srcppa);
GET_PIXEL (pixel, dstbpp, dst);
GET_PIXELVALS (dR, dG, dB, dA, pixel, dstfmt, dstppa);
if(sA == 0){
CREATE_PIXEL(dst, dR, dG, dB, dA, dstbpp, dstfmt);
dst[offsetR] = dR;
dst[offsetG] = dG;
dst[offsetB] = dB;
}
else if(sA == 255){
CREATE_PIXEL(dst, sR, sG, sB, sA, dstbpp, dstfmt);
dst[offsetR] = sR;
dst[offsetG] = sG;
dst[offsetB] = sB;
}
else{
ALPHA_BLEND_PREMULTIPLIED (tmp, sR, sG, sB, sA, dR, dG, dB, dA);
CREATE_PIXEL(dst, dR, dG, dB, dA, dstbpp, dstfmt);
dst[offsetR] = dR;
dst[offsetG] = dG;
dst[offsetB] = dB;
}

src += srcpxskip;
dst += dstpxskip;
}, n, width);
src += srcskip;
dst += dstskip;
}

}
else /* dstbpp > 1 */
{
Expand Down
3 changes: 2 additions & 1 deletion src_c/pgcompat.h
Expand Up @@ -215,7 +215,8 @@ SDL_Surface * SDL_CreateRGBSurfaceWithFormat(Uint32 flags, int width, int height
#endif
#endif /* defined(SDL_VERSION_ATLEAST) */

// Needed to build with Windows SDK 10.0.18362.0
// Currently needed to build scrap.c, event.c, display.c
// with Windows SDK 10.0.18362.0 and SDL1 build
#ifdef _MSC_VER
#ifndef WINDOWS_IGNORE_PACKING_MISMATCH
#define WINDOWS_IGNORE_PACKING_MISMATCH
Expand Down
2 changes: 2 additions & 0 deletions src_c/scrap.c
Expand Up @@ -25,6 +25,8 @@
#include <limits.h>
#include <stdio.h>

#include "pgcompat.h"

#include "SDL.h"

#include "SDL_syswm.h"
Expand Down
2 changes: 1 addition & 1 deletion src_c/surface.h
Expand Up @@ -335,7 +335,7 @@
} \
} while(0)

#define ALPHA_BLEND_PREMULTIPLIED_COMP(sC, dC, sA) (sC + ((dC * (255 - sA)) >> 8))
/*
 * Blend a single colour channel of a premultiplied-alpha source pixel
 * onto a destination channel.
 *
 *   sC  source channel value
 *   dC  destination channel value
 *   sA  source alpha
 *
 * Evaluates to  sC + dC - (((dC + 1) * sA) >> 8).
 *
 * Every parameter is parenthesized so that expression arguments
 * (e.g. `a + b` or `x | y`) keep their intended grouping.  Note that dC
 * is expanded twice, so arguments with side effects must be avoided.
 */
#define ALPHA_BLEND_PREMULTIPLIED_COMP(sC, dC, sA) \
    ((sC) + (dC) - ((((dC) + 1) * (sA)) >> 8))

#define ALPHA_BLEND_PREMULTIPLIED(tmp, sR, sG, sB, sA, dR, dG, dB, dA) \
do { \
Expand Down