From 0e4a73074f62fd6c75b177301e01d4c7f4703a39 Mon Sep 17 00:00:00 2001
From: ruema <ruema>
Date: Thu, 25 Jul 2024 21:10:47 +0000
Subject: [PATCH 1/8] Speed up float pack and unpack

---
 Include/internal/pycore_floatobject.h  |   5 -
 Include/internal/pycore_runtime.h      |   1 -
 Include/internal/pycore_runtime_init.h |   4 -
 Objects/floatobject.c                  | 758 ++++++++++++-------------
 Tools/c-analyzer/TODO                  |   4 -
 5 files changed, 374 insertions(+), 398 deletions(-)
diff --git a/Include/internal/pycore_floatobject.h b/Include/internal/pycore_floatobject.h
index be1c6cc97720d2..b9be2348345162 100644
--- a/Include/internal/pycore_floatobject.h
+++ b/Include/internal/pycore_floatobject.h
@@ -25,11 +25,6 @@ enum _py_float_format_type {
     _py_float_format_ieee_little_endian,
 };
 
-struct _Py_float_runtime_state {
-    enum _py_float_format_type float_format;
-    enum _py_float_format_type double_format;
-};
-
 
 
 
diff --git a/Include/internal/pycore_runtime.h b/Include/internal/pycore_runtime.h
index d4ffd977940a02..db66a291f2be58 100644
--- a/Include/internal/pycore_runtime.h
+++ b/Include/internal/pycore_runtime.h
@@ -276,7 +276,6 @@ typedef struct pyruntimestate {
     } audit_hooks;
 
     struct _py_object_runtime_state object_state;
-    struct _Py_float_runtime_state float_state;
     struct _Py_unicode_runtime_state unicode_state;
     struct _types_runtime_state types;
 
diff --git a/Include/internal/pycore_runtime_init.h b/Include/internal/pycore_runtime_init.h
index da2b8d5570de62..a4dd7224d9588e 100644
--- a/Include/internal/pycore_runtime_init.h
+++ b/Include/internal/pycore_runtime_init.h
@@ -152,10 +152,6 @@ extern PyTypeObject _PyExc_MemoryError;
         .stoptheworld = { \
             .is_global = 1, \
         }, \
-        .float_state = { \
-            .float_format = _py_float_format_unknown, \
-            .double_format = _py_float_format_unknown, \
-        }, \
         .types = { \
             .next_version_tag = 1, \
         }, \
diff --git a/Objects/floatobject.c b/Objects/floatobject.c
index 82f39de421f245..497122d5641e30 100644
--- a/Objects/floatobject.c
+++ b/Objects/floatobject.c
@@ -18,6 +18,7 @@
 
 #include <float.h>                // DBL_MAX
 #include <stdlib.h>               // strtol()
+#include <stdint.h>               // uint64_t
 
 /*[clinic input]
 class float "PyObject *" "&PyFloat_Type"
@@ -1683,8 +1684,81 @@ typedef enum _py_float_format_type float_format_type;
 #define ieee_big_endian_format _py_float_format_ieee_big_endian
 #define ieee_little_endian_format _py_float_format_ieee_little_endian
 
-#define float_format (_PyRuntime.float_state.float_format)
-#define double_format (_PyRuntime.float_state.double_format)
+static inline uint32_t byte_swap_uint32(uint32_t x) {
+    return (
+        ((x >> 24) & 0xff)
+        | ((x >> 8) & 0xff00)
+        | ((x & 0xff00) << 8)
+        | ((x & 0xff) << 24)
+    );
+}
+
+static inline uint64_t byte_swap_uint64(uint64_t x) {
+    return (
+        ((x >> 56) & 0xff)
+        | ((x >> 40) & 0xff00)
+        | ((x >> 24) & 0xff0000)
+        | ((x >> 8) & 0xff000000)
+        | ((x & 0xff000000) << 8)
+        | ((x & 0xff0000) << 24)
+        | ((x & 0xff00) << 40)
+        | ((x & 0xff) << 56)
+    );
+}
+
+/* We attempt to determine if this machine is using IEEE
+    floating point formats by peering at the bits of some
+    carefully chosen values.  If it looks like we are on an
+    IEEE platform, the float packing/unpacking routines can
+    just copy bits, if not they resort to arithmetic & shifts
+    and masks.  The shifts & masks approach works on all finite
+    values, but what happens to infinities, NaNs and signed
+    zeroes on packing is an accident, and attempting to unpack
+    a NaN or an infinity will raise an exception.
+
+    Note that if we're on some whacked-out platform which uses
+    IEEE formats but isn't strictly little-endian or big-
+    endian, we will fall back to the portable shifts & masks
+    method.
+
+    These functions are reduced to a no-op via compiler
+    optimizations. */
+
+static inline float_format_type get_float_format(void) {
+    if (sizeof(float) == 4) {
+        float y = 16711938.0;
+        uint32_t z;
+        memcpy(&z, &y, 4);
+        if (z == 0x4b7f0102)
+        {
+            if (memcmp(&y, "\x4b\x7f\x01\x02", 4) == 0) {
+                return ieee_big_endian_format;
+            } else
+            if (memcmp(&y, "\x02\x01\x7f\x4b", 4) == 0) {
+                return ieee_little_endian_format;
+            }
+        }
+    }
+    return unknown_format;
+}
+
+static inline float_format_type get_double_format(void) {
+    if (sizeof(float) == 4) {
+        double y = 9006104071832581.0;
+        uint64_t z;
+        memcpy(&z, &y, 8);
+        if (z == 0x433fff0102030405)
+        {
+            if (memcmp(&y, "\x43\x3f\xff\x01\x02\x03\x04\x05", 8) == 0) {
+                return ieee_big_endian_format;
+            } else
+            if (memcmp(&y, "\x05\x04\x03\x02\x01\xff\x3f\x43", 8) == 0) {
+                return ieee_little_endian_format;
+            }
+        }
+    }
+    return unknown_format;
+}
 
 
 /*[clinic input]
@@ -1711,10 +1785,10 @@ float___getformat___impl(PyTypeObject *type, const char *typestr)
     float_format_type r;
 
     if (strcmp(typestr, "double") == 0) {
-        r = double_format;
+        r = get_double_format();
     }
     else if (strcmp(typestr, "float") == 0) {
-        r = float_format;
+        r = get_float_format();
     }
     else {
         PyErr_SetString(PyExc_ValueError,
@@ -1887,57 +1961,6 @@ PyTypeObject PyFloat_Type = {
     .tp_vectorcall = (vectorcallfunc)float_vectorcall,
 };
 
-static void
-_init_global_state(void)
-{
-    float_format_type detected_double_format, detected_float_format;
-
-    /* We attempt to determine if this machine is using IEEE
-       floating-point formats by peering at the bits of some
-       carefully chosen values.  If it looks like we are on an
-       IEEE platform, the float packing/unpacking routines can
-       just copy bits, if not they resort to arithmetic & shifts
-       and masks.  The shifts & masks approach works on all finite
-       values, but what happens to infinities, NaNs and signed
-       zeroes on packing is an accident, and attempting to unpack
-       a NaN or an infinity will raise an exception.
-
-       Note that if we're on some whacked-out platform which uses
-       IEEE formats but isn't strictly little-endian or big-
-       endian, we will fall back to the portable shifts & masks
-       method. */
-
-#if SIZEOF_DOUBLE == 8
-    {
-        double x = 9006104071832581.0;
-        if (memcmp(&x, "\x43\x3f\xff\x01\x02\x03\x04\x05", 8) == 0)
-            detected_double_format = ieee_big_endian_format;
-        else if (memcmp(&x, "\x05\x04\x03\x02\x01\xff\x3f\x43", 8) == 0)
-            detected_double_format = ieee_little_endian_format;
-        else
-            detected_double_format = unknown_format;
-    }
-#else
-    detected_double_format = unknown_format;
-#endif
-
-#if SIZEOF_FLOAT == 4
-    {
-        float y = 16711938.0;
-        if (memcmp(&y, "\x4b\x7f\x01\x02", 4) == 0)
-            detected_float_format = ieee_big_endian_format;
-        else if (memcmp(&y, "\x02\x01\x7f\x4b", 4) == 0)
-            detected_float_format = ieee_little_endian_format;
-        else
-            detected_float_format = unknown_format;
-    }
-#else
-    detected_float_format = unknown_format;
-#endif
-
-    double_format = detected_double_format;
-    float_format = detected_float_format;
-}
 
 void
 _PyFloat_InitState(PyInterpreterState *interp)
@@ -1945,7 +1968,6 @@ _PyFloat_InitState(PyInterpreterState *interp)
     if (!_Py_IsMainInterpreter(interp)) {
         return;
     }
-    _init_global_state();
 }
 
 PyStatus
@@ -2099,105 +2121,98 @@ PyFloat_Pack2(double x, char *data, int le)
 int
 PyFloat_Pack4(double x, char *data, int le)
 {
-    unsigned char *p = (unsigned char *)data;
-    if (float_format == unknown_format) {
-        unsigned char sign;
-        int e;
-        double f;
-        unsigned int fbits;
-        int incr = 1;
-
-        if (le) {
-            p += 3;
-            incr = -1;
-        }
-
-        if (x < 0) {
-            sign = 1;
-            x = -x;
-        }
-        else
-            sign = 0;
-
-        f = frexp(x, &e);
-
-        /* Normalize f to be in the range [1.0, 2.0) */
-        if (0.5 <= f && f < 1.0) {
-            f *= 2.0;
-            e--;
-        }
-        else if (f == 0.0)
-            e = 0;
-        else {
-            PyErr_SetString(PyExc_SystemError,
-                            "frexp() result out of range");
-            return -1;
-        }
-
-        if (e >= 128)
+    float_format_type format = get_float_format();
+    if (format != unknown_format) {
+        float z = (float)x;  /* deliberate narrowing to single precision */
+        if (isinf(z) && !isinf(x))
             goto Overflow;
-        else if (e < -126) {
-            /* Gradual underflow */
-            f = ldexp(f, 126 + e);
-            e = 0;
-        }
-        else if (!(e == 0 && f == 0.0)) {
-            e += 127;
-            f -= 1.0; /* Get rid of leading 1 */
-        }
-
-        f *= 8388608.0; /* 2**23 */
-        fbits = (unsigned int)(f + 0.5); /* Round */
-        assert(fbits <= 8388608);
-        if (fbits >> 23) {
-            /* The carry propagated out of a string of 23 1 bits. */
-            fbits = 0;
-            ++e;
-            if (e >= 255)
-                goto Overflow;
+        /* Store with memcpy: data may be unaligned, and writing through
+           a uint32_t pointer would also break strict aliasing. */
+        uint32_t s;
+        memcpy(&s, &z, 4);
+        if ((format == ieee_little_endian_format) != (le != 0)) {
+            s = byte_swap_uint32(s);
         }
+        memcpy(data, &s, 4);
+        return 0;
+    }
 
-        /* First byte */
-        *p = (sign << 7) | (e >> 1);
-        p += incr;
 
-        /* Second byte */
-        *p = (char) (((e & 1) << 7) | (fbits >> 16));
-        p += incr;
+    unsigned char *p = (unsigned char *)data;
+    unsigned char sign;
+    int e;
+    double f;
+    unsigned int fbits;
+    int incr = 1;
 
-        /* Third byte */
-        *p = (fbits >> 8) & 0xFF;
-        p += incr;
+    if (le) {
+        p += 3;
+        incr = -1;
+    }
 
-        /* Fourth byte */
-        *p = fbits & 0xFF;
+    if (x < 0) {
+        sign = 1;
+        x = -x;
+    }
+    else
+        sign = 0;
 
-        /* Done */
-        return 0;
+    f = frexp(x, &e);
 
+    /* Normalize f to be in the range [1.0, 2.0) */
+    if (0.5 <= f && f < 1.0) {
+        f *= 2.0;
+        e--;
     }
+    else if (f == 0.0)
+        e = 0;
     else {
-        float y = (float)x;
-        int i, incr = 1;
+        PyErr_SetString(PyExc_SystemError,
+                        "frexp() result out of range");
+        return -1;
+    }
+
+    if (e >= 128)
+        goto Overflow;
+    else if (e < -126) {
+        /* Gradual underflow */
+        f = ldexp(f, 126 + e);
+        e = 0;
+    }
+    else if (!(e == 0 && f == 0.0)) {
+        e += 127;
+        f -= 1.0; /* Get rid of leading 1 */
+    }
 
-        if (isinf(y) && !isinf(x))
+    f *= 8388608.0; /* 2**23 */
+    fbits = (unsigned int)(f + 0.5); /* Round */
+    assert(fbits <= 8388608);
+    if (fbits >> 23) {
+        /* The carry propagated out of a string of 23 1 bits. */
+        fbits = 0;
+        ++e;
+        if (e >= 255)
             goto Overflow;
+    }
 
-        unsigned char s[sizeof(float)];
-        memcpy(s, &y, sizeof(float));
+    /* First byte */
+    *p = (sign << 7) | (e >> 1);
+    p += incr;
 
-        if ((float_format == ieee_little_endian_format && !le)
-            || (float_format == ieee_big_endian_format && le)) {
-            p += 3;
-            incr = -1;
-        }
+    /* Second byte */
+    *p = (char) (((e & 1) << 7) | (fbits >> 16));
+    p += incr;
+
+    /* Third byte */
+    *p = (fbits >> 8) & 0xFF;
+    p += incr;
+
+    /* Fourth byte */
+    *p = fbits & 0xFF;
+
+    /* Done */
+    return 0;
 
-        for (i = 0; i < 4; i++) {
-            *p = s[i];
-            p += incr;
-        }
-        return 0;
-    }
   Overflow:
     PyErr_SetString(PyExc_OverflowError,
                     "float too large to pack with f format");
@@ -2207,131 +2222,126 @@ PyFloat_Pack4(double x, char *data, int le)
 int
 PyFloat_Pack8(double x, char *data, int le)
 {
-    unsigned char *p = (unsigned char *)data;
-    if (double_format == unknown_format) {
-        unsigned char sign;
-        int e;
-        double f;
-        unsigned int fhi, flo;
-        int incr = 1;
-
-        if (le) {
-            p += 7;
-            incr = -1;
-        }
-
-        if (x < 0) {
-            sign = 1;
-            x = -x;
+    float_format_type format = get_double_format();
+    if (format != unknown_format) {
+        /* Store with memcpy: data may be unaligned, and writing through
+           a uint64_t pointer would also break strict aliasing. */
+        uint64_t s;
+        memcpy(&s, &x, 8);
+        if ((format == ieee_little_endian_format) != (le != 0)) {
+            s = byte_swap_uint64(s);
         }
-        else
-            sign = 0;
+        memcpy(data, &s, 8);
+        return 0;
+    }
 
-        f = frexp(x, &e);
+    unsigned char *p = (unsigned char *)data;
+    unsigned char sign;
+    int e;
+    double f;
+    unsigned int fhi, flo;
+    int incr = 1;
 
-        /* Normalize f to be in the range [1.0, 2.0) */
-        if (0.5 <= f && f < 1.0) {
-            f *= 2.0;
-            e--;
-        }
-        else if (f == 0.0)
-            e = 0;
-        else {
-            PyErr_SetString(PyExc_SystemError,
-                            "frexp() result out of range");
-            return -1;
-        }
+    if (le) {
+        p += 7;
+        incr = -1;
+    }
 
-        if (e >= 1024)
-            goto Overflow;
-        else if (e < -1022) {
-            /* Gradual underflow */
-            f = ldexp(f, 1022 + e);
-            e = 0;
-        }
-        else if (!(e == 0 && f == 0.0)) {
-            e += 1023;
-            f -= 1.0; /* Get rid of leading 1 */
-        }
+    if (x < 0) {
+        sign = 1;
+        x = -x;
+    }
+    else
+        sign = 0;
 
-        /* fhi receives the high 28 bits; flo the low 24 bits (== 52 bits) */
-        f *= 268435456.0; /* 2**28 */
-        fhi = (unsigned int)f; /* Truncate */
-        assert(fhi < 268435456);
-
-        f -= (double)fhi;
-        f *= 16777216.0; /* 2**24 */
-        flo = (unsigned int)(f + 0.5); /* Round */
-        assert(flo <= 16777216);
-        if (flo >> 24) {
-            /* The carry propagated out of a string of 24 1 bits. */
-            flo = 0;
-            ++fhi;
-            if (fhi >> 28) {
-                /* And it also propagated out of the next 28 bits. */
-                fhi = 0;
-                ++e;
-                if (e >= 2047)
-                    goto Overflow;
-            }
-        }
+    f = frexp(x, &e);
 
-        /* First byte */
-        *p = (sign << 7) | (e >> 4);
-        p += incr;
+    /* Normalize f to be in the range [1.0, 2.0) */
+    if (0.5 <= f && f < 1.0) {
+        f *= 2.0;
+        e--;
+    }
+    else if (f == 0.0)
+        e = 0;
+    else {
+        PyErr_SetString(PyExc_SystemError,
+                        "frexp() result out of range");
+        return -1;
+    }
 
-        /* Second byte */
-        *p = (unsigned char) (((e & 0xF) << 4) | (fhi >> 24));
-        p += incr;
+    if (e >= 1024)
+        goto Overflow;
+    else if (e < -1022) {
+        /* Gradual underflow */
+        f = ldexp(f, 1022 + e);
+        e = 0;
+    }
+    else if (!(e == 0 && f == 0.0)) {
+        e += 1023;
+        f -= 1.0; /* Get rid of leading 1 */
+    }
+
+    /* fhi receives the high 28 bits; flo the low 24 bits (== 52 bits) */
+    f *= 268435456.0; /* 2**28 */
+    fhi = (unsigned int)f; /* Truncate */
+    assert(fhi < 268435456);
+
+    f -= (double)fhi;
+    f *= 16777216.0; /* 2**24 */
+    flo = (unsigned int)(f + 0.5); /* Round */
+    assert(flo <= 16777216);
+    if (flo >> 24) {
+        /* The carry propagated out of a string of 24 1 bits. */
+        flo = 0;
+        ++fhi;
+        if (fhi >> 28) {
+            /* And it also propagated out of the next 28 bits. */
+            fhi = 0;
+            ++e;
+            if (e >= 2047)
+                goto Overflow;
+        }
+    }
 
-        /* Third byte */
-        *p = (fhi >> 16) & 0xFF;
-        p += incr;
+    /* First byte */
+    *p = (sign << 7) | (e >> 4);
+    p += incr;
 
-        /* Fourth byte */
-        *p = (fhi >> 8) & 0xFF;
-        p += incr;
+    /* Second byte */
+    *p = (unsigned char) (((e & 0xF) << 4) | (fhi >> 24));
+    p += incr;
 
-        /* Fifth byte */
-        *p = fhi & 0xFF;
-        p += incr;
+    /* Third byte */
+    *p = (fhi >> 16) & 0xFF;
+    p += incr;
 
-        /* Sixth byte */
-        *p = (flo >> 16) & 0xFF;
-        p += incr;
+    /* Fourth byte */
+    *p = (fhi >> 8) & 0xFF;
+    p += incr;
 
-        /* Seventh byte */
-        *p = (flo >> 8) & 0xFF;
-        p += incr;
+    /* Fifth byte */
+    *p = fhi & 0xFF;
+    p += incr;
 
-        /* Eighth byte */
-        *p = flo & 0xFF;
-        /* p += incr; */
+    /* Sixth byte */
+    *p = (flo >> 16) & 0xFF;
+    p += incr;
 
-        /* Done */
-        return 0;
+    /* Seventh byte */
+    *p = (flo >> 8) & 0xFF;
+    p += incr;
 
-      Overflow:
-        PyErr_SetString(PyExc_OverflowError,
-                        "float too large to pack with d format");
-        return -1;
-    }
-    else {
-        const unsigned char *s = (unsigned char*)&x;
-        int i, incr = 1;
+    /* Eighth byte */
+    *p = flo & 0xFF;
+    /* p += incr; */
 
-        if ((double_format == ieee_little_endian_format && !le)
-            || (double_format == ieee_big_endian_format && le)) {
-            p += 7;
-            incr = -1;
-        }
+    /* Done */
+    return 0;
 
-        for (i = 0; i < 8; i++) {
-            *p = *s++;
-            p += incr;
-        }
-        return 0;
-    }
+    Overflow:
+    PyErr_SetString(PyExc_OverflowError,
+                    "float too large to pack with d format");
+    return -1;
 }
 
 double
@@ -2389,174 +2399,154 @@ PyFloat_Unpack2(const char *data, int le)
 double
 PyFloat_Unpack4(const char *data, int le)
 {
-    unsigned char *p = (unsigned char *)data;
-    if (float_format == unknown_format) {
-        unsigned char sign;
-        int e;
-        unsigned int f;
-        double x;
-        int incr = 1;
-
-        if (le) {
-            p += 3;
-            incr = -1;
-        }
-
-        /* First byte */
-        sign = (*p >> 7) & 1;
-        e = (*p & 0x7F) << 1;
-        p += incr;
-
-        /* Second byte */
-        e |= (*p >> 7) & 1;
-        f = (*p & 0x7F) << 16;
-        p += incr;
-
-        if (e == 255) {
-            PyErr_SetString(
-                PyExc_ValueError,
-                "can't unpack IEEE 754 special value "
-                "on non-IEEE platform");
-            return -1;
+    float_format_type format = get_float_format();
+    if (format != unknown_format) {
+        float r;
+        uint32_t s;
+        memcpy(&s, data, 4);  /* data may be unaligned: never cast it */
+        if ((format == ieee_little_endian_format) != (le != 0)) {
+            s = byte_swap_uint32(s);  /* non-native byte order */
         }
+        memcpy(&r, &s, 4);
+        return r;
+    }
 
-        /* Third byte */
-        f |= *p << 8;
-        p += incr;
-
-        /* Fourth byte */
-        f |= *p;
+    unsigned char *p = (unsigned char *)data;
+    unsigned char sign;
+    int e;
+    unsigned int f;
+    double x;
+    int incr = 1;
 
-        x = (double)f / 8388608.0;
+    if (le) {
+        p += 3;
+        incr = -1;
+    }
 
-        /* XXX This sadly ignores Inf/NaN issues */
-        if (e == 0)
-            e = -126;
-        else {
-            x += 1.0;
-            e -= 127;
-        }
-        x = ldexp(x, e);
+    /* First byte */
+    sign = (*p >> 7) & 1;
+    e = (*p & 0x7F) << 1;
+    p += incr;
 
-        if (sign)
-            x = -x;
+    /* Second byte */
+    e |= (*p >> 7) & 1;
+    f = (*p & 0x7F) << 16;
+    p += incr;
 
-        return x;
+    if (e == 255) {
+        PyErr_SetString(
+            PyExc_ValueError,
+            "can't unpack IEEE 754 special value "
+            "on non-IEEE platform");
+        return -1;
     }
-    else {
-        float x;
 
-        if ((float_format == ieee_little_endian_format && !le)
-            || (float_format == ieee_big_endian_format && le)) {
-            char buf[4];
-            char *d = &buf[3];
-            int i;
+    /* Third byte */
+    f |= *p << 8;
+    p += incr;
 
-            for (i = 0; i < 4; i++) {
-                *d-- = *p++;
-            }
-            memcpy(&x, buf, 4);
-        }
-        else {
-            memcpy(&x, p, 4);
-        }
+    /* Fourth byte */
+    f |= *p;
+
+    x = (double)f / 8388608.0;
 
-        return x;
+    /* XXX This sadly ignores Inf/NaN issues */
+    if (e == 0)
+        e = -126;
+    else {
+        x += 1.0;
+        e -= 127;
     }
+    x = ldexp(x, e);
+
+    if (sign)
+        x = -x;
+
+    return x;
 }
 
 double
 PyFloat_Unpack8(const char *data, int le)
 {
-    unsigned char *p = (unsigned char *)data;
-    if (double_format == unknown_format) {
-        unsigned char sign;
-        int e;
-        unsigned int fhi, flo;
-        double x;
-        int incr = 1;
-
-        if (le) {
-            p += 7;
-            incr = -1;
+    float_format_type format = get_double_format();
+    if (format != unknown_format) {
+        double r;
+        uint64_t s;
+        memcpy(&s, data, 8);  /* data may be unaligned: never cast it */
+        if ((format == ieee_little_endian_format) != (le != 0)) {
+            s = byte_swap_uint64(s);  /* non-native byte order */
         }
+        memcpy(&r, &s, 8);
+        return r;
+    }
 
-        /* First byte */
-        sign = (*p >> 7) & 1;
-        e = (*p & 0x7F) << 4;
+    unsigned char *p = (unsigned char *)data;
+    unsigned char sign;
+    int e;
+    unsigned int fhi, flo;
+    double x;
+    int incr = 1;
 
-        p += incr;
+    if (le) {
+        p += 7;
+        incr = -1;
+    }
 
-        /* Second byte */
-        e |= (*p >> 4) & 0xF;
-        fhi = (*p & 0xF) << 24;
-        p += incr;
+    /* First byte */
+    sign = (*p >> 7) & 1;
+    e = (*p & 0x7F) << 4;
 
-        if (e == 2047) {
-            PyErr_SetString(
-                PyExc_ValueError,
-                "can't unpack IEEE 754 special value "
-                "on non-IEEE platform");
-            return -1.0;
-        }
+    p += incr;
 
-        /* Third byte */
-        fhi |= *p << 16;
-        p += incr;
+    /* Second byte */
+    e |= (*p >> 4) & 0xF;
+    fhi = (*p & 0xF) << 24;
+    p += incr;
 
-        /* Fourth byte */
-        fhi |= *p  << 8;
-        p += incr;
+    if (e == 2047) {
+        PyErr_SetString(
+            PyExc_ValueError,
+            "can't unpack IEEE 754 special value "
+            "on non-IEEE platform");
+        return -1.0;
+    }
 
-        /* Fifth byte */
-        fhi |= *p;
-        p += incr;
+    /* Third byte */
+    fhi |= *p << 16;
+    p += incr;
 
-        /* Sixth byte */
-        flo = *p << 16;
-        p += incr;
+    /* Fourth byte */
+    fhi |= *p  << 8;
+    p += incr;
 
-        /* Seventh byte */
-        flo |= *p << 8;
-        p += incr;
+    /* Fifth byte */
+    fhi |= *p;
+    p += incr;
 
-        /* Eighth byte */
-        flo |= *p;
+    /* Sixth byte */
+    flo = *p << 16;
+    p += incr;
 
-        x = (double)fhi + (double)flo / 16777216.0; /* 2**24 */
-        x /= 268435456.0; /* 2**28 */
+    /* Seventh byte */
+    flo |= *p << 8;
+    p += incr;
 
-        if (e == 0)
-            e = -1022;
-        else {
-            x += 1.0;
-            e -= 1023;
-        }
-        x = ldexp(x, e);
+    /* Eighth byte */
+    flo |= *p;
 
-        if (sign)
-            x = -x;
+    x = (double)fhi + (double)flo / 16777216.0; /* 2**24 */
+    x /= 268435456.0; /* 2**28 */
 
-        return x;
-    }
+    if (e == 0)
+        e = -1022;
     else {
-        double x;
-
-        if ((double_format == ieee_little_endian_format && !le)
-            || (double_format == ieee_big_endian_format && le)) {
-            char buf[8];
-            char *d = &buf[7];
-            int i;
+        x += 1.0;
+        e -= 1023;
+    }
+    x = ldexp(x, e);
 
-            for (i = 0; i < 8; i++) {
-                *d-- = *p++;
-            }
-            memcpy(&x, buf, 8);
-        }
-        else {
-            memcpy(&x, p, 8);
-        }
+    if (sign)
+        x = -x;
 
-        return x;
-    }
+    return x;
 }
diff --git a/Tools/c-analyzer/TODO b/Tools/c-analyzer/TODO
index 3d599538510bd9..63e38342f30386 100644
--- a/Tools/c-analyzer/TODO
+++ b/Tools/c-analyzer/TODO
@@ -56,10 +56,6 @@ Modules/posixmodule.c:initialized                                static int init
 Modules/signalmodule.c:initialized                               static int initialized
 Modules/timemodule.c:initialized                                 static int initialized
 Objects/dictobject.c:pydict_global_version                       static uint64_t pydict_global_version
-Objects/floatobject.c:detected_double_format                     static float_format_type detected_double_format
-Objects/floatobject.c:detected_float_format                      static float_format_type detected_float_format
-Objects/floatobject.c:double_format                              static float_format_type double_format
-Objects/floatobject.c:float_format                               static float_format_type
 Objects.longobject.c:_Py_quick_int_allocs                        Py_ssize_t _Py_quick_int_allocs
 Objects.longobject.c:_Py_quick_neg_int_allocs                    Py_ssize_t _Py_quick_neg_int_allocs
 Objects/moduleobject.c:max_module_number                         static Py_ssize_t max_module_number

From 7207b516267dcd5122bd213a4e1648e4156909ee Mon Sep 17 00:00:00 2001
From: ruema <ruema>
Date: Wed, 31 Jul 2024 20:48:35 +0000
Subject: [PATCH 2/8] add always_inline to help gcc optimize the code

---
 Objects/floatobject.c | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/Objects/floatobject.c b/Objects/floatobject.c
index 497122d5641e30..dac4165e213ed4 100644
--- a/Objects/floatobject.c
+++ b/Objects/floatobject.c
@@ -1684,7 +1684,11 @@ typedef enum _py_float_format_type float_format_type;
 #define ieee_big_endian_format _py_float_format_ieee_big_endian
 #define ieee_little_endian_format _py_float_format_ieee_little_endian
 
-static inline uint32_t byte_swap_uint32(uint32_t x) {
+#ifdef __GNUC__
+__attribute__((always_inline))
+#endif
+static inline uint32_t
+byte_swap_uint32(uint32_t x) {
     return (
         ((x >> 24) & 0xff)
         | ((x >> 8) & 0xff00)
@@ -1693,7 +1697,11 @@ static inline uint32_t byte_swap_uint32(uint32_t x) {
     );
 }
 
-static inline uint64_t byte_swap_uint64(uint64_t x) {
+#ifdef __GNUC__
+__attribute__((always_inline))
+#endif
+static inline uint64_t
+byte_swap_uint64(uint64_t x) {
     return (
         ((x >> 56) & 0xff)
         | ((x >> 40) & 0xff00)
@@ -1724,7 +1732,11 @@ static inline uint64_t byte_swap_uint64(uint64_t x) {
     These functions are reduced to a no-op via compiler
     optimizations. */
 
-static inline float_format_type get_float_format(void) {
+#ifdef __GNUC__
+__attribute__((always_inline))
+#endif
+static inline float_format_type
+get_float_format(void) {
     if (sizeof(float) == 4) {
         float y = 16711938.0;
         uint32_t z;
@@ -1742,7 +1754,11 @@ static inline float_format_type get_float_format(void) {
     return unknown_format;
 }
 
-static inline float_format_type get_double_format(void) {
+#ifdef __GNUC__
+__attribute__((always_inline))
+#endif
+static inline float_format_type
+get_double_format(void) {
     if (sizeof(float) == 4) {
         double y = 9006104071832581.0;
         uint64_t z;

From da52114153c9c6dfcaf28610536267c8a52a0165 Mon Sep 17 00:00:00 2001
From: ruema <ruema>
Date: Wed, 31 Jul 2024 20:58:09 +0000
Subject: [PATCH 3/8] blurb

---
 .../next/C_API/2024-07-26-21-45-44.gh-issue-324234.Mu5JSt.rst    | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 Misc/NEWS.d/next/C_API/2024-07-26-21-45-44.gh-issue-324234.Mu5JSt.rst

diff --git a/Misc/NEWS.d/next/C_API/2024-07-26-21-45-44.gh-issue-324234.Mu5JSt.rst b/Misc/NEWS.d/next/C_API/2024-07-26-21-45-44.gh-issue-324234.Mu5JSt.rst
new file mode 100644
index 00000000000000..9b4eb2565409d7
--- /dev/null
+++ b/Misc/NEWS.d/next/C_API/2024-07-26-21-45-44.gh-issue-324234.Mu5JSt.rst
@@ -0,0 +1 @@
+Improve performance of `PyFloat_Pack4`, `PyFloat_Pack8`, `PyFloat_Unpack4` and `PyFloat_Unpack8`.

From 7deb4da8a11f98dffd0047553efe79bea8f9a70d Mon Sep 17 00:00:00 2001
From: ruema <ruema>
Date: Wed, 31 Jul 2024 21:07:35 +0000
Subject: [PATCH 4/8] remove now-unused _PyFloat_InitState

---
 Include/internal/pycore_floatobject.h | 1 -
 Objects/floatobject.c                 | 8 --------
 Python/pylifecycle.c                  | 2 --
 3 files changed, 11 deletions(-)

diff --git a/Include/internal/pycore_floatobject.h b/Include/internal/pycore_floatobject.h
index b9be2348345162..a9aee5d599eb0a 100644
--- a/Include/internal/pycore_floatobject.h
+++ b/Include/internal/pycore_floatobject.h
@@ -12,7 +12,6 @@ extern "C" {
 
 /* runtime lifecycle */
 
-extern void _PyFloat_InitState(PyInterpreterState *);
 extern PyStatus _PyFloat_InitTypes(PyInterpreterState *);
 extern void _PyFloat_FiniType(PyInterpreterState *);
 
diff --git a/Objects/floatobject.c b/Objects/floatobject.c
index dac4165e213ed4..d850616d4d3e25 100644
--- a/Objects/floatobject.c
+++ b/Objects/floatobject.c
@@ -1978,14 +1978,6 @@ PyTypeObject PyFloat_Type = {
 };
 
 
-void
-_PyFloat_InitState(PyInterpreterState *interp)
-{
-    if (!_Py_IsMainInterpreter(interp)) {
-        return;
-    }
-}
-
 PyStatus
 _PyFloat_InitTypes(PyInterpreterState *interp)
 {
diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c
index 6b641c0775f533..38d4f8bed0f8a9 100644
--- a/Python/pylifecycle.c
+++ b/Python/pylifecycle.c
@@ -697,8 +697,6 @@ pycore_init_global_objects(PyInterpreterState *interp)
 {
     PyStatus status;
 
-    _PyFloat_InitState(interp);
-
     status = _PyUnicode_InitGlobalObjects(interp);
     if (_PyStatus_EXCEPTION(status)) {
         return status;

From 4e8ef916de557f2bbc6c29388c2bc1b0cb331c69 Mon Sep 17 00:00:00 2001
From: ruema <ruema>
Date: Wed, 31 Jul 2024 21:44:13 +0000
Subject: [PATCH 5/8] update blurb

---
 .../next/C_API/2024-07-26-21-45-44.gh-issue-122534.Mu5JSt.rst   | 2 ++
 1 file changed, 2 insertions(+)
 create mode 100644 Misc/NEWS.d/next/C_API/2024-07-26-21-45-44.gh-issue-122534.Mu5JSt.rst

diff --git a/Misc/NEWS.d/next/C_API/2024-07-26-21-45-44.gh-issue-122534.Mu5JSt.rst b/Misc/NEWS.d/next/C_API/2024-07-26-21-45-44.gh-issue-122534.Mu5JSt.rst
new file mode 100644
index 00000000000000..f05070641d5706
--- /dev/null
+++ b/Misc/NEWS.d/next/C_API/2024-07-26-21-45-44.gh-issue-122534.Mu5JSt.rst
@@ -0,0 +1,2 @@
+`Improve performance of ``PyFloat_Pack4``, ``PyFloat_Pack8``, ``PyFloat_Unpack4`` and ``PyFloat_Unpack8``.
+`

From 83c1fa06001566d51779a9faf5cf38fc2b5c8fd6 Mon Sep 17 00:00:00 2001
From: ruema <ruema>
Date: Wed, 31 Jul 2024 21:44:33 +0000
Subject: [PATCH 6/8] update blurb

---
 .../next/C_API/2024-07-26-21-45-44.gh-issue-324234.Mu5JSt.rst    | 1 -
 1 file changed, 1 deletion(-)
 delete mode 100644 Misc/NEWS.d/next/C_API/2024-07-26-21-45-44.gh-issue-324234.Mu5JSt.rst

diff --git a/Misc/NEWS.d/next/C_API/2024-07-26-21-45-44.gh-issue-324234.Mu5JSt.rst b/Misc/NEWS.d/next/C_API/2024-07-26-21-45-44.gh-issue-324234.Mu5JSt.rst
deleted file mode 100644
index 9b4eb2565409d7..00000000000000
--- a/Misc/NEWS.d/next/C_API/2024-07-26-21-45-44.gh-issue-324234.Mu5JSt.rst
+++ /dev/null
@@ -1 +0,0 @@
-Improve performance of `PyFloat_Pack4`, `PyFloat_Pack8`, `PyFloat_Unpack4` and `PyFloat_Unpack8`.

From 64101d48c8c4797b02aa7e0fc9433a2c1108deef Mon Sep 17 00:00:00 2001
From: ruema <ruema>
Date: Wed, 31 Jul 2024 21:57:46 +0000
Subject: [PATCH 7/8] update blurb

---
 .../2024-07-26-21-45-44.gh-issue-122534.Mu5JSt.rst                | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename Misc/NEWS.d/next/{C_API => C API}/2024-07-26-21-45-44.gh-issue-122534.Mu5JSt.rst (100%)

diff --git a/Misc/NEWS.d/next/C_API/2024-07-26-21-45-44.gh-issue-122534.Mu5JSt.rst b/Misc/NEWS.d/next/C API/2024-07-26-21-45-44.gh-issue-122534.Mu5JSt.rst
similarity index 100%
rename from Misc/NEWS.d/next/C_API/2024-07-26-21-45-44.gh-issue-122534.Mu5JSt.rst
rename to Misc/NEWS.d/next/C API/2024-07-26-21-45-44.gh-issue-122534.Mu5JSt.rst

From c450f74748ccfa481de757b70f4d47d720dc62d0 Mon Sep 17 00:00:00 2001
From: ruema <ruema>
Date: Wed, 31 Jul 2024 21:58:15 +0000
Subject: [PATCH 8/8] update blurb

---
 .../next/C API/2024-07-26-21-45-44.gh-issue-122534.Mu5JSt.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Misc/NEWS.d/next/C API/2024-07-26-21-45-44.gh-issue-122534.Mu5JSt.rst b/Misc/NEWS.d/next/C API/2024-07-26-21-45-44.gh-issue-122534.Mu5JSt.rst
index f05070641d5706..66be85fb80b33b 100644
--- a/Misc/NEWS.d/next/C API/2024-07-26-21-45-44.gh-issue-122534.Mu5JSt.rst	
+++ b/Misc/NEWS.d/next/C API/2024-07-26-21-45-44.gh-issue-122534.Mu5JSt.rst	
@@ -1,2 +1,2 @@
-`Improve performance of ``PyFloat_Pack4``, ``PyFloat_Pack8``, ``PyFloat_Unpack4`` and ``PyFloat_Unpack8``.
-`
+Improve performance of :c:func:`PyFloat_Pack4`, :c:func:`PyFloat_Pack8`, :c:func:`PyFloat_Unpack4` and :c:func:`PyFloat_Unpack8`.
+