
Partially refactor SSE2 blitters with macros #2656

Merged

Conversation

@itzpr3d4t0r (Member) commented Jan 4, 2024

Similar to #2055, this PR streamlines and condenses our SSE2 blitter code to make it easier to work with, understand, and fix. It only covers the RGB and RGBA variants of the ADD, SUB, MULT, MIN, and MAX blend modes, and it makes the file 514 lines shorter.

These changes don't hurt performance in any case; in fact they improve the SSE2 MULT/RGBA_MULT cases by about 15-21%, since the old version used a less efficient strategy.
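For context on what such a strategy typically looks like: a common SSE2 approach to a per-channel MULT is to widen the 8-bit channels to 16 bits, multiply, shift back down by 8, and repack with saturation. Below is a minimal, illustrative sketch of that general idea; it is not the code from this PR, and pygame's exact rounding for BLEND_MULT may differ.

#include <emmintrin.h>

/* Illustrative sketch only: multiply-blend 4 packed 8-bit-per-channel pixels
 * (one __m128i) by widening to 16-bit lanes so the products don't overflow. */
static __m128i
mult_blend_4px(__m128i src, __m128i dst)
{
    const __m128i zero = _mm_setzero_si128();

    /* widen: low/high 8 bytes (2 pixels each) -> eight 16-bit lanes */
    __m128i src_lo = _mm_unpacklo_epi8(src, zero);
    __m128i src_hi = _mm_unpackhi_epi8(src, zero);
    __m128i dst_lo = _mm_unpacklo_epi8(dst, zero);
    __m128i dst_hi = _mm_unpackhi_epi8(dst, zero);

    /* per-channel multiply, then divide by 256 via a right shift */
    __m128i res_lo = _mm_srli_epi16(_mm_mullo_epi16(src_lo, dst_lo), 8);
    __m128i res_hi = _mm_srli_epi16(_mm_mullo_epi16(src_hi, dst_hi), 8);

    /* narrow back to 8-bit channels with unsigned saturation */
    return _mm_packus_epi16(res_lo, res_hi);
}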

I used two separate programs to test: one to generate the timing data and another to plot and print the results.
A word of warning: the generation program has some options at the top of the file that keep the testing fair and precise. The current setup generates .json files that can get very large, so watch out or reduce the REP value.

Data Program:

import pygame
import json
from timeit import repeat
from datetime import datetime

pygame.init()

screen = pygame.display.set_mode((500, 500))

tests = []
for mode in [
    "ADD",
    "RGBA_ADD",
    "SUB",
    "RGBA_SUB",
    "MULT",
    "RGBA_MULT",
    "MIN",
    "RGBA_MIN",
    "MAX",
    "RGBA_MAX",
]:
    for old in ["_old"]:  # switch "_old" to "_new" when benchmarking the PR branch
        stmt = f"base.blit(surf, (0, 0), special_flags=pygame.BLEND_{mode})"
        tests.append(("blit_sse2_" + mode + old, stmt))

MAX_SIZE = 1000  # largest square surface side length tested (px)
REP = 250  # timeit repeat count per size
NUM = 1  # blits per timing sample
PRINT_EVERY = 100  # progress bar update interval (in sizes)


def run_tests(tests: list):
    # print hours and minutes and seconds of start time
    start_time = datetime.now()
    print(f"Started at {start_time.strftime('%H:%M:%S')}")

    for filename, stmt in tests:
        data = {"title": filename, "data": []}
        test_start_time = datetime.now()
        print(f"=====| {filename} |=====")

        print("<", end="")
        for size in range(1, MAX_SIZE + 1):
            if size % PRINT_EVERY == 0:
                g = 3
                amt = g * size // PRINT_EVERY
                print('\r[' + '*' * (amt) + ' ' * (g * 10 - amt) + ']', end="")

            surf = pygame.Surface((size, size)).convert_alpha()
            base = pygame.Surface((size, size)).convert_alpha()

            g = globals()
            g["surf"] = surf
            g["base"] = base
            times = repeat(stmt, number=NUM, repeat=REP, globals=g)
            data["data"].append(times)

        test_end_time = datetime.now()
        print(f"\nFinished {filename} in {test_end_time - test_start_time}")
        print("====================================\n")

        with open(f"{filename}.json", "w") as f:
            json.dump(data, f)

    # print hours and minutes and seconds of end time and delta time
    end_time = datetime.now()
    print(f"Finished at {end_time.strftime('%H:%M:%S')}")
    print(f"Total time it took: {end_time - start_time}")



run_tests(tests)
pygame.quit()

Visualization Program:

import json

from matplotlib import pyplot as plt
from statistics import mean, stdev, median

plt.style.use(["dark_background", "no-latex", "notebook"])  # "no-latex"/"notebook" come from the SciencePlots package

TEST_NAME = "Blit Test"
MODE = "MIN"
LIMIT_TO_RANGE = 500
files = []

comparative_tests = [
    # (("blit_sse2_ADD_old", "red", MODE), ("blit_sse2_ADD_new", "green", MODE)),
    # (("blit_sse2_SUB_old", "red", MODE), ("blit_sse2_SUB_new", "green", MODE)),
    # (("blit_sse2_MULT_old", "red", MODE), ("blit_sse2_MULT_new", "green", MODE)),
    # (("blit_sse2_MIN_old", "red", MODE), ("blit_sse2_MIN_new", "green", MODE)),
    # (("blit_sse2_MAX_old", "white", MODE), ("blit_sse2_MAX_new", "blue", MODE)),
    # (("blit_sse2_ADD_old", "white", MODE), ("blit_sse2_ADD_new", "blue", MODE)),
]


def plot_tests(tests: list):
    for file_name, color, mode in tests:
        try:
            with open(f"{file_name}.json", "r") as f:
                data = json.load(f)
        except FileNotFoundError:
            print(f"File {file_name}.json not found!")
            quit()

        f = mean
        match mode:
            case "MEAN":
                f = mean
            case "MIN":
                f = min
            case "MAX":
                f = max
            case "MEDIAN":
                f = median

        timings = [f(data_point) for data_point in data["data"]][:LIMIT_TO_RANGE]

        print(f"=== {file_name} ===")
        print(
            f"Total: {sum([sum(data_point) for data_point in data['data']])}\n"
            f"Mean: {mean(timings)}\n"
            f"Median: {median(timings)}\n"
            f"Stdev: {stdev(timings)}"
        )
        print()
        plt.plot(timings, color=color, label=file_name, linewidth=1)


def plot_and_compare_tests(c_tests: list):
    for test1, test2 in c_tests:
        filename_1, color_1, mode_1 = test1
        filename_2, color_2, mode_2 = test2

        try:
            with open(f"{filename_1}.json", "r") as f:
                data_1 = json.load(f)
        except FileNotFoundError:
            print(f"File {filename_1}.json not found!")
            quit()

        try:
            with open(f"{filename_2}.json", "r") as f:
                data_2 = json.load(f)
        except FileNotFoundError:
            print(f"File {filename_2}.json not found!")
            quit()

        f = mean
        match mode_1:
            case "MEAN":
                f = mean
            case "MIN":
                f = min
            case "MAX":
                f = max
            case "MEDIAN":
                f = median

        timings_1 = [f(data_point) for data_point in data_1["data"]][:LIMIT_TO_RANGE]
        timings_2 = [f(data_point) for data_point in data_2["data"]][:LIMIT_TO_RANGE]

        print(f"=== {filename_1} vs {filename_2} ===")
        print(
            f"Total 1: {sum([sum(data_point) for data_point in data_1['data']])}\n"
            f"Mean 1: {mean(timings_1)}\n"
            f"Median 1: {median(timings_1)}\n"
            f"Stdev 1: {stdev(timings_1)}"
        )
        print(
            f"Total 2: {sum([sum(data_point) for data_point in data_2['data']])}\n"
            f"Mean 2: {mean(timings_2)}\n"
            f"Median 2: {median(timings_2)}\n"
            f"Stdev 2: {stdev(timings_2)}"
        )

        print()
        plt.plot(timings_1, color=color_1, label=filename_1, linewidth=1)
        plt.plot(timings_2, color=color_2, label=filename_2, linewidth=1)


# plot_tests(files)  # optionally plot the individual runs listed in `files`
plot_and_compare_tests(comparative_tests)

plt.title(TEST_NAME)
plt.xlabel("Surface size (px)")
plt.ylabel("Time (s)")
plt.legend()
plt.show()

@itzpr3d4t0r added the Performance, SIMD, and Surface labels on Jan 4, 2024
@itzpr3d4t0r requested a review from a team as a code owner on January 4, 2024 at 18:00
@Starbuck5 (Member) commented:

So funny story, guess what I was working on last night before you posted this PR? Starbuck5@4233e2c. I guess I'll shelve that.

Alright, here's what I think.

I don't like having the DUFF loop in the non-4-at-a-time pixel segments. It seems wasteful to duplicate that much code just so it can run once per row. The 4-at-a-time segment could go through 800 pixels every row, while the non-4-at-a-time remainder will only ever handle 3 at most.
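To illustrate the shape being discussed, here is a rough sketch with hypothetical names (add_one_pixel, add_blit_rows, and a pitch measured in pixels are all invented for illustration, not pygame's actual blitter code): the main loop handles 4 pixels per SSE2 iteration, and the per-row remainder of at most 3 pixels is finished with a plain scalar loop instead of a duplicated DUFF block.

#include <emmintrin.h>
#include <stdint.h>

/* Hypothetical scalar fallback for one ADD-blended pixel. */
static uint32_t
add_one_pixel(uint32_t s, uint32_t d)
{
    uint32_t out = 0;
    for (int shift = 0; shift < 32; shift += 8) {
        uint32_t sum = ((s >> shift) & 0xFF) + ((d >> shift) & 0xFF);
        out |= (sum > 255 ? 255 : sum) << shift;
    }
    return out;
}

/* Sketch of the loop shape: 4 pixels per SSE2 iteration, then a plain
 * loop for the <= 3 leftover pixels of each row. `pitch` is in pixels. */
static void
add_blit_rows(uint32_t *src, uint32_t *dst, int width, int height, int pitch)
{
    for (int y = 0; y < height; y++) {
        uint32_t *src_row = src + y * pitch;
        uint32_t *dst_row = dst + y * pitch;

        int x = 0;
        for (; x + 4 <= width; x += 4) {
            __m128i s = _mm_loadu_si128((const __m128i *)(src_row + x));
            __m128i d = _mm_loadu_si128((const __m128i *)(dst_row + x));
            d = _mm_adds_epu8(d, s); /* the blend op slots in here */
            _mm_storeu_si128((__m128i *)(dst_row + x), d);
        }
        for (; x < width; x++) /* runs at most 3 times per row */
            dst_row[x] = add_one_pixel(src_row[x], dst_row[x]);
    }
}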

I'd like to keep the functions themselves (like the function definitions, the names) handwritten in the code, rather than having them defined by the macro. This way you can jump around the code in an IDE to those functions or search for them directly. This is also a style preference. Thoughts?

I'd like the blit macros to be more generic. Rather than specific slots for code, alpha code, and alpha prep, I'd prefer one slot for code and one slot for prep. The prep slot wouldn't even be necessary if you handwrite the functions.

Here's an example of what I'm talking about:

void
blit_blend_rgb_add_sse2(SDL_BlitInfo *info)
{
    SETUP_SSE2_BLITTER;
    const __m128i mm128_rgbmask =
        _mm_set1_epi32(~(info->src->Amask | info->dst->Amask));

    RUN_SSE2_BLITTER({
        mm128_src = _mm_and_si128(mm128_src, mm128_rgbmask);
        mm128_dst = _mm_adds_epu8(mm128_dst, mm128_src);
    })
}

This macro strategy would be more consistent with the AVX macro strategy and I believe it would be better suited to expand to the SSE2 alpha blitters in the future. I understand this will be more lines of code than what you have now, but I don't think it's bloated at all.
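For readers following along, here is a rough sketch of what such a setup/run macro pair could expand to. The macro bodies and the SDL_BlitInfo field names (s_pixels, d_pixels, s_skip, d_skip) are assumptions for illustration; the macros actually added by this PR differ in the details, and the scalar remainder handling is omitted.

/* Hypothetical sketch of the setup/run split being proposed. */
#define SETUP_SSE2_BLITTER                                  \
    int width = info->width, height = info->height;        \
    Uint32 *srcp = (Uint32 *)info->s_pixels;                \
    Uint32 *dstp = (Uint32 *)info->d_pixels;                \
    int srcskip = info->s_skip >> 2;                        \
    int dstskip = info->d_skip >> 2;                        \
    __m128i mm128_src, mm128_dst;

#define RUN_SSE2_BLITTER(BLEND_CODE)                        \
    while (height--) {                                      \
        int x = 0;                                          \
        for (; x + 4 <= width; x += 4) {                    \
            mm128_src = _mm_loadu_si128((__m128i *)srcp);   \
            mm128_dst = _mm_loadu_si128((__m128i *)dstp);   \
            BLEND_CODE                                      \
            _mm_storeu_si128((__m128i *)dstp, mm128_dst);   \
            srcp += 4;                                      \
            dstp += 4;                                      \
        }                                                   \
        /* a scalar loop for the <= 3 leftover pixels would go here */ \
        srcp += srcskip;                                    \
        dstp += dstskip;                                    \
    }

In this layout the blend operation is the only thing each blitter has to supply, which is what keeps the handwritten function definitions above so short.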

I understand you're using the same strategy you used in the SSE2 fill and I didn't complain then, but this has a better argument for wanting to be more generic.

Also, tell me about the const. Any performance impact from that? We haven't been using const on our __m128is before.

@itzpr3d4t0r (Member, Author) replied:

> I don't like having the DUFF loop in the non-4-at-a-time pixel segments. It seems wasteful to duplicate that much code just so it can run once per row. The 4-at-a-time segment could go through 800 pixels every row, while the non-4-at-a-time remainder will only ever handle 3 at most.

I'm fine with this. I'll test the perf difference out of curiosity, but I'll change it regardless.

> I'd like to keep the functions themselves (like the function definitions, the names) handwritten in the code, rather than having them defined by the macro. This way you can jump around the code in an IDE to those functions or search for them directly. This is also a style preference. Thoughts?

> I understand you're using the same strategy you used in the SSE2 fill and I didn't complain then, but this has a better argument for wanting to be more generic.

I was trying to replicate the filler strategy. My idea was to keep the macros generic for now. This way, if we want to modify a function like ADD in the future, we can simply remove one line and apply your strategy of using macros within a function definition. This means we won’t have to change all the macros, just replace individual instances with custom code. I understand your point and partially agree. However, one can simply look at the SSE blitters file to see that the blitters are defined with macros, which should give a clear picture of what’s happening.
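To make the contrast concrete, here is a rough sketch of the "whole function defined by one macro" approach being described. The macro name and layout are invented for illustration and are not the exact macros in this PR; it reuses the hypothetical SETUP_SSE2_BLITTER / RUN_SSE2_BLITTER names from the sketch above.

/* Hypothetical sketch: one macro stamps out the entire blitter function. */
#define DEFINE_SSE2_BLITTER(NAME, BLEND_CODE) \
    void NAME(SDL_BlitInfo *info)             \
    {                                         \
        SETUP_SSE2_BLITTER;                   \
        RUN_SSE2_BLITTER(BLEND_CODE)          \
    }

/* One line per blend mode; removing a line and handwriting that function
 * is all it takes to switch a single blitter to the other style. */
DEFINE_SSE2_BLITTER(blit_blend_rgba_add_sse2,
                    { mm128_dst = _mm_adds_epu8(mm128_dst, mm128_src); })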

> Also, tell me about the const. Any performance impact from that? We haven't been using const on our __m128is before.

The only reason I did that was to help the people working on this code see at a glance what's constant and what isn't, so basically to make it easier to read.

@Starbuck5 (Member) commented Jan 17, 2024

I did not do any performance testing; I trust your results.

I did do some correctness testing, with the following program:

import random
import hashlib

import pygame

random.seed(36)

test_flags = [
    pygame.BLEND_RGB_ADD,
    pygame.BLEND_RGBA_ADD,
    pygame.BLEND_RGB_SUB,
    pygame.BLEND_RGBA_SUB,
    pygame.BLEND_RGB_MAX,
    pygame.BLEND_RGBA_MAX,
    pygame.BLEND_RGB_MIN,
    pygame.BLEND_RGBA_MIN,
    pygame.BLEND_RGB_MULT,
    pygame.BLEND_RGBA_MULT,
]

expected_hashes = [
    "a32c63faa4e6485f523d89d6abf36915e3577666a15f582a6355f0928460ac0e",
    "7a28ea1660965a1d696751fd752243e71a57b15ce093b4bf0161b77be44a0866",
    "3eb1ce3593d4e5c849484476633426e206c28055f21670dacf4b75a889153150",
    "d167fe901dbe6eabf4072642f6281ff259ee95f18488b56b8d2c8085c5100cf8",
    "28ac1f3bd5b264780c270abeddfcbbf0db5fa60470a4d5bd95de85e560b205f6",
    "5d0524379429a3d446573630bb0c0a84986685bffaf064f4c9e6a417a040a327",
    "caa5a5c310ae532e3310ce1b4b52d0f76680e613c1d12349aa171b9d937c3250",
    "1e8a730cbad33abc8be6e01338a8d9d2065cebf314d2dcae4c8c09e4ff648a0e",
    "d4e17e77fd276f631860d51e861087cfb0e3c7bae04728a917fc645c8e92bbdb",
    "2b63502cb372733967e476d076a044ae51566cbc70aa4e6e5295748505a3aaff",
]

surf_size = (20, 10)
offset = (3, 7)


def populate_surf(surf):
    for y in range(surf.get_height()):
        for x in range(surf.get_width()):
            surf.set_at(
                (x, y),
                (
                    random.randint(0, 255),
                    random.randint(0, 255),
                    random.randint(0, 255),
                    random.randint(0, 255),
                ),
            )

hashes = []

for flag in test_flags:
    source = pygame.Surface(surf_size, pygame.SRCALPHA)
    dest = pygame.Surface(surf_size, pygame.SRCALPHA)

    populate_surf(source)
    populate_surf(dest)

    dest.blit(source, offset, special_flags=flag)

    sha256 = hashlib.sha256()
    sha256.update(pygame.image.tobytes(dest, "RGBA"))
    digest = sha256.hexdigest()

    assert digest == expected_hashes.pop(0)

    hashes.append(digest)
    print(digest)

print(hashes)

I tested the results with SSE2 on main, AVX2 on main, and SSE2 on this branch. All of them had the same hashes.

@Starbuck5 (Member) left a review comment:

Nice code quality improvement! (And also perf improvement for mult)

@itzpr3d4t0r added the Code quality/robustness label on Jan 17, 2024
@itzpr3d4t0r force-pushed the rewrite_sse2_blitters_with_macros branch from 02b7420 to c648be6 on January 28, 2024
@MyreMylar (Member) left a review comment:

LGTM.

I can confirm the small performance enhancement on my machine, and the reduction in code size speaks for itself. Nice work! 🎉

@MyreMylar merged commit 30e73da into pygame-community:main on Feb 4, 2024 (30 checks passed)
@itzpr3d4t0r added this to the 2.5.0 milestone on Feb 4, 2024
@itzpr3d4t0r deleted the rewrite_sse2_blitters_with_macros branch on March 5, 2024