Skip to content

Commit 1999a20

Browse files
mr-cp-sawicki
authored and committed
[mypyc] librt base64: use existing SIMD CPU dispatch by customizing build flags (#20253)
Fixes the current SSE4.2 requirement added in 1b6ebb1 / #20244. This PR fully enables the existing x86-64 CPU detection and dispatch code for SSSE3, SSE4.1, SSE4.2, AVX, and AVX2 in the base64 module. To use the existing CPU dispatch from the [upstream base64 code](https://github.com/aklomp/base64), one needs to compile the sources in each of the CPU-specific codec directories with a specific compiler flag; alas, this is difficult to do with setuptools, but I found a solution inspired by https://stackoverflow.com/a/68508804. Note that I did not enable the AVX512 path with this PR, as many Intel CPUs that support AVX512 can incur a performance hit if AVX512 is used sporadically; the performance of the AVX512 (encoding) path needs to be evaluated in the context of how mypyc uses base64 in various realistic scenarios. (There is no AVX512-accelerated decoding path in the upstream base64 codebase; it falls back to the AVX2 decoder.) If there are additional performance concerns, then I suggest benchmarking with the OpenMP feature of base64 turned on, for multi-core processing.
1 parent 1b94fbb commit 1999a20

File tree

9 files changed

+135
-46
lines changed

9 files changed

+135
-46
lines changed

mypy_self_check.ini

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ pretty = True
88
always_false = MYPYC
99
plugins = mypy.plugins.proper_plugin
1010
python_version = 3.9
11-
exclude = mypy/typeshed/|mypyc/test-data/|mypyc/lib-rt/
11+
exclude = mypy/typeshed/|mypyc/test-data/
1212
enable_error_code = ignore-without-code,redundant-expr
1313
enable_incomplete_feature = PreciseTupleTypes
1414
show_error_code_links = True

mypyc/build.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
from collections.abc import Iterable
2929
from typing import TYPE_CHECKING, Any, NamedTuple, NoReturn, Union, cast
3030

31+
import mypyc.build_setup # noqa: F401
3132
from mypy.build import BuildSource
3233
from mypy.errors import CompileError
3334
from mypy.fscache import FileSystemCache
@@ -36,7 +37,7 @@
3637
from mypy.util import write_junit_xml
3738
from mypyc.annotate import generate_annotated_html
3839
from mypyc.codegen import emitmodule
39-
from mypyc.common import IS_FREE_THREADED, RUNTIME_C_FILES, X86_64, shared_lib_name
40+
from mypyc.common import IS_FREE_THREADED, RUNTIME_C_FILES, shared_lib_name
4041
from mypyc.errors import Errors
4142
from mypyc.ir.pprint import format_modules
4243
from mypyc.namegen import exported_name
@@ -70,6 +71,13 @@ class ModDesc(NamedTuple):
7071
"base64/arch/neon64/codec.c",
7172
],
7273
[
74+
"base64/arch/avx/enc_loop_asm.c",
75+
"base64/arch/avx2/enc_loop.c",
76+
"base64/arch/avx2/enc_loop_asm.c",
77+
"base64/arch/avx2/enc_reshuffle.c",
78+
"base64/arch/avx2/enc_translate.c",
79+
"base64/arch/avx2/dec_loop.c",
80+
"base64/arch/avx2/dec_reshuffle.c",
7381
"base64/arch/generic/32/enc_loop.c",
7482
"base64/arch/generic/64/enc_loop.c",
7583
"base64/arch/generic/32/dec_loop.c",
@@ -661,9 +669,6 @@ def mypycify(
661669
# See https://github.com/mypyc/mypyc/issues/956
662670
"-Wno-cpp",
663671
]
664-
if X86_64:
665-
# Enable SIMD extensions. All CPUs released since ~2010 support SSE4.2.
666-
cflags.append("-msse4.2")
667672
if log_trace:
668673
cflags.append("-DMYPYC_LOG_TRACE")
669674
if experimental_features:
@@ -692,10 +697,6 @@ def mypycify(
692697
# that we actually get the compilation speed and memory
693698
# use wins that multi-file mode is intended for.
694699
cflags += ["/GL-", "/wd9025"] # warning about overriding /GL
695-
if X86_64:
696-
# Enable SIMD extensions. All CPUs released since ~2010 support SSE4.2.
697-
# Also Windows 11 requires SSE4.2 since 24H2.
698-
cflags.append("/arch:SSE4.2")
699700
if log_trace:
700701
cflags.append("/DMYPYC_LOG_TRACE")
701702
if experimental_features:

mypyc/build_setup.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
import platform
2+
import sys
3+
4+
try:
5+
# Import setuptools so that it monkey-patch overrides distutils
6+
import setuptools # noqa: F401
7+
except ImportError:
8+
pass
9+
10+
if sys.version_info >= (3, 12):
11+
# From setuptools' monkeypatch
12+
from distutils import ccompiler # type: ignore[import-not-found]
13+
else:
14+
from distutils import ccompiler
15+
16+
EXTRA_FLAGS_PER_COMPILER_TYPE_PER_PATH_COMPONENT = {
17+
"unix": {
18+
"base64/arch/ssse3": ["-mssse3"],
19+
"base64/arch/sse41": ["-msse4.1"],
20+
"base64/arch/sse42": ["-msse4.2"],
21+
"base64/arch/avx2": ["-mavx2"],
22+
"base64/arch/avx": ["-mavx"],
23+
},
24+
"msvc": {
25+
"base64/arch/sse42": ["/arch:SSE4.2"],
26+
"base64/arch/avx2": ["/arch:AVX2"],
27+
"base64/arch/avx": ["/arch:AVX"],
28+
},
29+
}
30+
31+
ccompiler.CCompiler.__spawn = ccompiler.CCompiler.spawn # type: ignore[attr-defined]
32+
X86_64 = platform.machine() in ("x86_64", "AMD64", "amd64")
33+
34+
35+
def spawn(self, cmd, **kwargs) -> None:  # type: ignore[no-untyped-def]
    """Run a compiler command, adding per-directory SIMD flags where needed.

    Installed as a replacement for ``CCompiler.spawn``. On x86-64, every
    argument of ``cmd`` that names a ``.c`` file is checked against the path
    components registered for the active compiler type in
    ``EXTRA_FLAGS_PER_COMPILER_TYPE_PER_PATH_COMPONENT``; the flags of the
    first matching component are appended to the command. The (possibly
    extended) command is then passed to the saved original ``__spawn``.

    Args:
        cmd: The command line (sequence of arguments) about to be executed.
        **kwargs: Forwarded unchanged to the original ``spawn``.
    """
    compiler_type: str = self.compiler_type
    # Only some compiler types have an entry in the table. Use .get() so that
    # any other type (e.g. "mingw32", "cygwin", "bcpp") simply gets no extra
    # flags instead of raising KeyError and aborting every compile step.
    extra_options = EXTRA_FLAGS_PER_COMPILER_TYPE_PER_PATH_COMPONENT.get(compiler_type)
    new_cmd = list(cmd)
    if X86_64 and extra_options is not None:
        # Filenames are closer to the end of the command line. Walk a snapshot
        # so that appending flags to new_cmd cannot disturb the iteration.
        for argument in reversed(tuple(new_cmd)):
            argument_text = str(argument)
            # Only arguments that end with a C source filename are of interest.
            if not argument_text.endswith(".c"):
                continue

            # Matching is by substring and the first hit wins, so a more
            # specific key must be listed before any key that is a prefix of
            # it (e.g. ".../avx2" before ".../avx").
            for path, flags in extra_options.items():
                if path in argument_text:
                    if compiler_type == "bcpp":
                        # Borland accepts a source file name at the end,
                        # insert the options before it.
                        last_argument = new_cmd.pop()
                        new_cmd.extend(flags)
                        new_cmd.append(last_argument)
                    else:
                        new_cmd.extend(flags)

                    # Path component is found, no need to check other keys.
                    break
    self.__spawn(new_cmd, **kwargs)
60+
61+
62+
ccompiler.CCompiler.spawn = spawn # type: ignore[method-assign]

mypyc/common.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
from __future__ import annotations
22

3-
import platform
43
import sys
54
import sysconfig
65
from typing import Any, Final
@@ -45,8 +44,6 @@
4544

4645
IS_32_BIT_PLATFORM: Final = int(SIZEOF_SIZE_T) == 4
4746

48-
X86_64: Final = platform.machine() in ("x86_64", "AMD64", "amd64")
49-
5047
PLATFORM_SIZE = 4 if IS_32_BIT_PLATFORM else 8
5148

5249
# Maximum value for a short tagged integer.

mypyc/lib-rt/base64/arch/avx/codec.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
#include "../ssse3/dec_loop.c"
2525

2626
#if BASE64_AVX_USE_ASM
27-
# include "enc_loop_asm.c"
27+
# include "./enc_loop_asm.c"
2828
#else
2929
# include "../ssse3/enc_translate.c"
3030
# include "../ssse3/enc_reshuffle.c"

mypyc/lib-rt/base64/arch/avx2/codec.c

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,15 +20,15 @@
2020
# endif
2121
#endif
2222

23-
#include "dec_reshuffle.c"
24-
#include "dec_loop.c"
23+
#include "./dec_reshuffle.c"
24+
#include "./dec_loop.c"
2525

2626
#if BASE64_AVX2_USE_ASM
27-
# include "enc_loop_asm.c"
27+
# include "./enc_loop_asm.c"
2828
#else
29-
# include "enc_translate.c"
30-
# include "enc_reshuffle.c"
31-
# include "enc_loop.c"
29+
# include "./enc_translate.c"
30+
# include "./enc_reshuffle.c"
31+
# include "./enc_loop.c"
3232
#endif
3333

3434
#endif // HAVE_AVX2

mypyc/lib-rt/base64/config.h

Lines changed: 7 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,15 @@
11
#ifndef BASE64_CONFIG_H
22
#define BASE64_CONFIG_H
33

4-
#define BASE64_WITH_SSSE3 0
5-
#define HAVE_SSSE3 BASE64_WITH_SSSE3
6-
7-
#define BASE64_WITH_SSE41 0
8-
#define HAVE_SSE41 BASE64_WITH_SSE41
9-
10-
#if defined(__x86_64__) || defined(_M_X64)
11-
#define BASE64_WITH_SSE42 1
12-
#else
13-
#define BASE64_WITH_SSE42 0
4+
#if !defined(__APPLE__) && ((defined(__x86_64__) && defined(__LP64__)) || defined(_M_X64))
5+
#define HAVE_SSSE3 1
6+
#define HAVE_SSE41 1
7+
#define HAVE_SSE42 1
8+
#define HAVE_AVX 1
9+
#define HAVE_AVX2 1
10+
#define HAVE_AVX512 0
1411
#endif
1512

16-
#define HAVE_SSE42 BASE64_WITH_SSE42
17-
18-
#define BASE64_WITH_AVX 0
19-
#define HAVE_AVX BASE64_WITH_AVX
20-
21-
#define BASE64_WITH_AVX2 0
22-
#define HAVE_AVX2 BASE64_WITH_AVX2
23-
24-
#define BASE64_WITH_AVX512 0
25-
#define HAVE_AVX512 BASE64_WITH_AVX512
26-
2713
#define BASE64_WITH_NEON32 0
2814
#define HAVE_NEON32 BASE64_WITH_NEON32
2915

mypyc/lib-rt/setup.py

Lines changed: 48 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,55 @@
2525
"pythonsupport.c",
2626
]
2727

28+
EXTRA_FLAGS_PER_COMPILER_TYPE_PER_PATH_COMPONENT = {
29+
"unix": {
30+
"base64/arch/ssse3": ["-mssse3"],
31+
"base64/arch/sse41": ["-msse4.1"],
32+
"base64/arch/sse42": ["-msse4.2"],
33+
"base64/arch/avx2": ["-mavx2"],
34+
"base64/arch/avx": ["-mavx"],
35+
},
36+
"msvc": {
37+
"base64/arch/sse42": ["/arch:SSE4.2"],
38+
"base64/arch/avx2": ["/arch:AVX2"],
39+
"base64/arch/avx": ["/arch:AVX"],
40+
},
41+
}
42+
43+
ccompiler.CCompiler.__spawn = ccompiler.CCompiler.spawn # type: ignore[attr-defined]
2844
X86_64 = platform.machine() in ("x86_64", "AMD64", "amd64")
2945

3046

47+
def spawn(self, cmd, **kwargs) -> None:  # type: ignore[no-untyped-def]
    """Run a compiler command, adding per-directory SIMD flags where needed.

    Installed as a replacement for ``CCompiler.spawn``. On x86-64, every
    argument of ``cmd`` that names a ``.c`` file is checked against the path
    components registered for the active compiler type in
    ``EXTRA_FLAGS_PER_COMPILER_TYPE_PER_PATH_COMPONENT``; the flags of the
    first matching component are appended to the command. The (possibly
    extended) command is then passed to the saved original ``__spawn``.

    Args:
        cmd: The command line (sequence of arguments) about to be executed.
        **kwargs: Forwarded unchanged to the original ``spawn``.
    """
    compiler_type: str = self.compiler_type
    # Only some compiler types have an entry in the table. Use .get() so that
    # any other type (e.g. "mingw32", "cygwin", "bcpp") simply gets no extra
    # flags instead of raising KeyError and aborting every compile step.
    extra_options = EXTRA_FLAGS_PER_COMPILER_TYPE_PER_PATH_COMPONENT.get(compiler_type)
    new_cmd = list(cmd)
    if X86_64 and extra_options is not None:
        # Filenames are closer to the end of the command line. Walk a snapshot
        # so that appending flags to new_cmd cannot disturb the iteration.
        for argument in reversed(tuple(new_cmd)):
            argument_text = str(argument)
            # Only arguments that end with a C source filename are of interest.
            if not argument_text.endswith(".c"):
                continue

            # Matching is by substring and the first hit wins, so a more
            # specific key must be listed before any key that is a prefix of
            # it (e.g. ".../avx2" before ".../avx").
            for path, flags in extra_options.items():
                if path in argument_text:
                    if compiler_type == "bcpp":
                        # Borland accepts a source file name at the end,
                        # insert the options before it.
                        last_argument = new_cmd.pop()
                        new_cmd.extend(flags)
                        new_cmd.append(last_argument)
                    else:
                        new_cmd.extend(flags)

                    # Path component is found, no need to check other keys.
                    break
    self.__spawn(new_cmd, **kwargs)
72+
73+
74+
ccompiler.CCompiler.spawn = spawn # type: ignore[method-assign]
75+
76+
3177
class BuildExtGtest(build_ext):
3278
def get_library_names(self) -> list[str]:
3379
return ["gtest"]
@@ -80,14 +126,10 @@ def run(self) -> None:
80126
compiler = ccompiler.new_compiler()
81127
sysconfig.customize_compiler(compiler)
82128
cflags: list[str] = []
83-
if compiler.compiler_type == "unix":
129+
if compiler.compiler_type == "unix": # type: ignore[attr-defined]
84130
cflags += ["-O3"]
85-
if X86_64:
86-
cflags.append("-msse4.2") # Enable SIMD (see also mypyc/build.py)
87-
elif compiler.compiler_type == "msvc":
131+
elif compiler.compiler_type == "msvc": # type: ignore[attr-defined]
88132
cflags += ["/O2"]
89-
if X86_64:
90-
cflags.append("/arch:SSE4.2") # Enable SIMD (see also mypyc/build.py)
91133

92134
setup(
93135
ext_modules=[

setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@ def run(self) -> None:
9999
os.path.join("mypyc", "lib-rt", "setup.py"),
100100
# Uses __file__ at top level https://github.com/mypyc/mypyc/issues/700
101101
os.path.join("mypyc", "__main__.py"),
102+
os.path.join("mypyc", "build_setup.py"), # for monkeypatching
102103
)
103104

104105
everything = [os.path.join("mypy", x) for x in find_package_data("mypy", ["*.py"])] + [

0 commit comments

Comments
 (0)