From 029a4119d924dd4809aa41dd0dab42dd10d733ca Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Mon, 17 Nov 2025 10:19:10 +0000 Subject: [PATCH 01/10] WIP enable SSE4.2 --- mypyc/lib-rt/base64/config.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mypyc/lib-rt/base64/config.h b/mypyc/lib-rt/base64/config.h index fd516c4be2d6..c70f48fa57aa 100644 --- a/mypyc/lib-rt/base64/config.h +++ b/mypyc/lib-rt/base64/config.h @@ -7,7 +7,7 @@ #define BASE64_WITH_SSE41 0 #define HAVE_SSE41 BASE64_WITH_SSE41 -#define BASE64_WITH_SSE42 0 +#define BASE64_WITH_SSE42 1 #define HAVE_SSE42 BASE64_WITH_SSE42 #define BASE64_WITH_AVX 0 From 315bfe50e6e06b1a6cba2441afc9b04b9375e01f Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Mon, 17 Nov 2025 10:20:25 +0000 Subject: [PATCH 02/10] WIP Add SSE4.2 files --- mypyc/build.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mypyc/build.py b/mypyc/build.py index 8505a2d95701..9ce73849ca82 100644 --- a/mypyc/build.py +++ b/mypyc/build.py @@ -77,6 +77,9 @@ class ModDesc(NamedTuple): "base64/arch/generic/enc_tail.c", "base64/arch/generic/dec_head.c", "base64/arch/generic/dec_tail.c", + "base64/arch/ssse3/dec_reshuffle.c", + "base64/arch/ssse3/dec_loop.c", + "base64/arch/ssse3/enc_loop_asm.c", "base64/arch/neon64/dec_loop.c", "base64/arch/neon64/enc_loop_asm.c", "base64/codecs.h", From a60de54d0f500efac407b12fc4429469b49f9f51 Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Mon, 17 Nov 2025 10:23:53 +0000 Subject: [PATCH 03/10] WIP target SSE4.2 --- mypyc/build.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mypyc/build.py b/mypyc/build.py index 9ce73849ca82..a476c4a5b1fc 100644 --- a/mypyc/build.py +++ b/mypyc/build.py @@ -645,6 +645,7 @@ def mypycify( cflags += [ f"-O{opt_level}", f"-g{debug_level}", + "-msse4.2", # TODO "-Werror", "-Wno-unused-function", "-Wno-unused-label", From b5af1bc0bbb0d64bac5511b7af0e0bfa5df3428b Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Mon, 17 Nov 2025 10:44:28 +0000 
Subject: [PATCH 04/10] Add x86_64 architecture detection to config.h --- mypyc/lib-rt/base64/config.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mypyc/lib-rt/base64/config.h b/mypyc/lib-rt/base64/config.h index c70f48fa57aa..b5e47fb04e75 100644 --- a/mypyc/lib-rt/base64/config.h +++ b/mypyc/lib-rt/base64/config.h @@ -7,7 +7,12 @@ #define BASE64_WITH_SSE41 0 #define HAVE_SSE41 BASE64_WITH_SSE41 +#if defined(__x86_64__) || defined(_M_X64) #define BASE64_WITH_SSE42 1 +#else +#define BASE64_WITH_SSE42 0 +#endif + #define HAVE_SSE42 BASE64_WITH_SSE42 #define BASE64_WITH_AVX 0 From 19a7550a7d85734e5a12e8b1fdfd2c72a314d113 Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Mon, 17 Nov 2025 11:35:10 +0000 Subject: [PATCH 05/10] Enable SSE4.2 selectively --- mypyc/build.py | 10 ++++++++-- mypyc/common.py | 3 +++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/mypyc/build.py b/mypyc/build.py index a476c4a5b1fc..97a515deeff6 100644 --- a/mypyc/build.py +++ b/mypyc/build.py @@ -36,7 +36,7 @@ from mypy.util import write_junit_xml from mypyc.annotate import generate_annotated_html from mypyc.codegen import emitmodule -from mypyc.common import IS_FREE_THREADED, RUNTIME_C_FILES, shared_lib_name +from mypyc.common import IS_FREE_THREADED, RUNTIME_C_FILES, X86_64, shared_lib_name from mypyc.errors import Errors from mypyc.ir.pprint import format_modules from mypyc.namegen import exported_name @@ -645,7 +645,6 @@ def mypycify( cflags += [ f"-O{opt_level}", f"-g{debug_level}", - "-msse4.2", # TODO "-Werror", "-Wno-unused-function", "-Wno-unused-label", @@ -659,6 +658,9 @@ def mypycify( # See https://github.com/mypyc/mypyc/issues/956 "-Wno-cpp", ] + if X86_64: + # Enable SIMD extensions. All CPUs released since ~2010 support SSE4.2. 
+ cflags.append("-msse4.2") if log_trace: cflags.append("-DMYPYC_LOG_TRACE") if experimental_features: @@ -687,6 +689,10 @@ def mypycify( # that we actually get the compilation speed and memory # use wins that multi-file mode is intended for. cflags += ["/GL-", "/wd9025"] # warning about overriding /GL + if X86_64: + # Enable SIMD extensions. All CPUs released since ~2010 support SSE4.2. + # Windows 11 also requires SSE4.2 as of version 24H2. + cflags.append("/arch:SSE4.2") if log_trace: cflags.append("/DMYPYC_LOG_TRACE") if experimental_features: diff --git a/mypyc/common.py b/mypyc/common.py index 2de63c09bb2c..7aafef333ea6 100644 --- a/mypyc/common.py +++ b/mypyc/common.py @@ -1,5 +1,6 @@ from __future__ import annotations +import platform import sys import sysconfig from typing import Any, Final @@ -44,6 +45,8 @@ IS_32_BIT_PLATFORM: Final = int(SIZEOF_SIZE_T) == 4 +X86_64: Final = platform.machine() == "x86_64" + PLATFORM_SIZE = 4 if IS_32_BIT_PLATFORM else 8 # Maximum value for a short tagged integer. 
From 4007ba444268704ce2a783d1bb00bf29e7a00330 Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Mon, 17 Nov 2025 11:46:34 +0000 Subject: [PATCH 06/10] Fix Windows and possibly BSDs --- mypyc/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mypyc/common.py b/mypyc/common.py index 7aafef333ea6..98f8a89f6fcb 100644 --- a/mypyc/common.py +++ b/mypyc/common.py @@ -45,7 +45,7 @@ IS_32_BIT_PLATFORM: Final = int(SIZEOF_SIZE_T) == 4 -X86_64: Final = platform.machine() == "x86_64" +X86_64: Final = platform.machine() in ("x86_64", "AMD64", "amd64") PLATFORM_SIZE = 4 if IS_32_BIT_PLATFORM else 8 From f0ba28037270121651b399162c91bd59d95c3f73 Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Mon, 17 Nov 2025 12:58:31 +0000 Subject: [PATCH 07/10] Include missing source file --- mypyc/build.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mypyc/build.py b/mypyc/build.py index 97a515deeff6..4f4caefb384c 100644 --- a/mypyc/build.py +++ b/mypyc/build.py @@ -80,6 +80,7 @@ class ModDesc(NamedTuple): "base64/arch/ssse3/dec_reshuffle.c", "base64/arch/ssse3/dec_loop.c", "base64/arch/ssse3/enc_loop_asm.c", + "base64/arch/ssse3/enc_translate.c", "base64/arch/neon64/dec_loop.c", "base64/arch/neon64/enc_loop_asm.c", "base64/codecs.h", From e8f761b4415eb4a54e7447bdbbffcb107bc0777a Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Mon, 17 Nov 2025 13:19:17 +0000 Subject: [PATCH 08/10] Also enable SIMD in mypyc/lib-rt/setup.py --- mypyc/lib-rt/setup.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mypyc/lib-rt/setup.py b/mypyc/lib-rt/setup.py index acd61458e516..6a56c65306ae 100644 --- a/mypyc/lib-rt/setup.py +++ b/mypyc/lib-rt/setup.py @@ -6,6 +6,7 @@ from __future__ import annotations import os +import platform import subprocess import sys from distutils import ccompiler, sysconfig @@ -24,6 +25,8 @@ "pythonsupport.c", ] +X86_64 = platform.machine() in ("x86_64", "AMD64", "amd64") + class BuildExtGtest(build_ext): def get_library_names(self) -> 
list[str]: @@ -79,8 +82,12 @@ def run(self) -> None: cflags: list[str] = [] if compiler.compiler_type == "unix": cflags += ["-O3"] + if X86_64: + cflags.append("-msse4.2") # Enable SIMD (see also mypyc/build.py) elif compiler.compiler_type == "msvc": cflags += ["/O2"] + if X86_64: + cflags.append("/arch:SSE4.2") # Enable SIMD (see also mypyc/build.py) setup( ext_modules=[ From 39673b7a8b3485c39c15fe25b846ebabff96050e Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Mon, 17 Nov 2025 13:53:38 +0000 Subject: [PATCH 09/10] Add another missing file --- mypyc/build.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mypyc/build.py b/mypyc/build.py index 4f4caefb384c..a95d76df86ba 100644 --- a/mypyc/build.py +++ b/mypyc/build.py @@ -81,6 +81,7 @@ class ModDesc(NamedTuple): "base64/arch/ssse3/dec_loop.c", "base64/arch/ssse3/enc_loop_asm.c", "base64/arch/ssse3/enc_translate.c", + "base64/arch/ssse3/enc_reshuffle.c", "base64/arch/neon64/dec_loop.c", "base64/arch/neon64/enc_loop_asm.c", "base64/codecs.h", From bf3761d2b835cbf403e247634c91b4e03f1dba9d Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Mon, 17 Nov 2025 14:35:41 +0000 Subject: [PATCH 10/10] Add another missing file --- mypyc/build.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mypyc/build.py b/mypyc/build.py index a95d76df86ba..02f427c83426 100644 --- a/mypyc/build.py +++ b/mypyc/build.py @@ -82,6 +82,7 @@ class ModDesc(NamedTuple): "base64/arch/ssse3/enc_loop_asm.c", "base64/arch/ssse3/enc_translate.c", "base64/arch/ssse3/enc_reshuffle.c", + "base64/arch/ssse3/enc_loop.c", "base64/arch/neon64/dec_loop.c", "base64/arch/neon64/enc_loop_asm.c", "base64/codecs.h",