From 04ccb47c6d7885eed08bd6c27f8f2ef2921b655f Mon Sep 17 00:00:00 2001 From: Oscar Benjamin Date: Mon, 26 Sep 2022 00:23:56 +0100 Subject: [PATCH] Use subquadratic algorithms for int(string) As identified in gh-95778 the algorithm used for decimal to binary conversion by int(string) has quadratic complexity. Following on from the refactor of PyLong_FromString in gh-96808 this commit implements a subquadratic algorithm for parsing strings from decimal and other bases leveraging the subquadratic complexity of integer multiplication. --- Objects/longobject.c | 235 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 230 insertions(+), 5 deletions(-) diff --git a/Objects/longobject.c b/Objects/longobject.c index 77a8782d8a675d..e4cdec0e4f7fa0 100644 --- a/Objects/longobject.c +++ b/Objects/longobject.c @@ -83,6 +83,24 @@ maybe_small_long(PyLongObject *v) #define KARATSUBA_CUTOFF 70 #define KARATSUBA_SQUARE_CUTOFF (2 * KARATSUBA_CUTOFF) +/* For parsing strings in decimal or other non binary bases a quadratic + * algorithm is used for strings with at most BASE_QUADRATIC_CUTOFF + * character digits. The subquadratic algorithm that is used for larger input + * strings first reads in chunks of BASE_QUADRATIC_CHUNKSIZE digits using the + * quadratic algorithm and then combines those using int multiplication. The + * optimal values of these limits have not been systematically explored but at + * least for decimal it seems that a chunk size significantly smaller or larger + * (e.g. 100 or 100000) is slower. + * + * For best performance over all bases this limit should probably be expressed + * in units of PyLong digits rather than string digits because the optimal + * number of string digits would be base dependent. However only base 10 is of + * significant interest and these values will probably give performance not far + * from optimal for other bases (3 and 36 are the extremal cases). 
+ */ +#define BASE_QUADRATIC_CUTOFF 10000 +#define BASE_QUADRATIC_CHUNKSIZE 500 + /* For exponentiation, use the binary left-to-right algorithm unless the ^ exponent contains more than HUGE_EXP_CUTOFF bits. In that case, do * (no more than) EXP_WINDOW_SIZE bits at a time. The potential drawback is @@ -2207,7 +2225,8 @@ unsigned char _PyLong_DigitValue[256] = { * 0 else (exception may be set, in that case *res is set to NULL) */ static int -long_from_binary_base(const char *start, const char *end, Py_ssize_t digits, int base, PyLongObject **res) +long_from_binary_base(const char *start, const char *end, + Py_ssize_t digits, int base, PyLongObject **res) { const char *p; int bits_per_char; @@ -2273,7 +2292,7 @@ long_from_binary_base(const char *start, const char *end, Py_ssize_t digits, int } /*** -long_from_non_binary_base: parameters and return values are the same as +long_from_base_quadratic: parameters and return values are the same as long_from_binary_base. Binary bases can be converted in time linear in the number of digits, because @@ -2361,7 +2380,9 @@ just 1 digit at the start, so that the copying code was exercised for every digit beyond the first. ***/ static int -long_from_non_binary_base(const char *start, const char *end, Py_ssize_t digits, int base, PyLongObject **res) +long_from_base_quadratic(const char *start, const char *end, + Py_ssize_t digits, int base, + PyLongObject **res) { twodigits c; /* current input character */ Py_ssize_t size_z; @@ -2494,6 +2515,203 @@ long_from_non_binary_base(const char *start, const char *end, Py_ssize_t digits, return 0; } + +static PyLongObject * k_mul(PyLongObject *a, PyLongObject *b); +static PyLongObject * x_add(PyLongObject *a, PyLongObject *b); +static PyObject * long_pow(PyObject *v, PyObject *w, PyObject *x); + +/* long_from_base_subquadratic: parameters and return values are the same as + * long_from_binary_base. 
+ * + * This function is used for parsing larger strings into an int from a base + * that is not a power of 2 (e.g. decimal) when the number of digits is greater + * than BASE_QUADRATIC_CUTOFF. The idea is to parse segments of the input + * string having BASE_QUADRATIC_CHUNKSIZE digits into separate PyLongs using + * the basic quadratic algorithm (long_from_base_quadratic). This gives an + * initial representation of the final result in base + * base**BASE_QUADRATIC_CHUNKSIZE with each digit being a PyLong (which is + * internally binary). Those digits are combined to build up the final result + * using integer multiplication. This is asymptotically faster than the basic + * quadratic algorithm because it leverages the subquadratic complexity of + * multiplication of large integers. + * + * A pure Python implementation of this algorithm is: + * + * def parse_int(S: str, B: int = 10) -> int: + * """parse string S as an integer in base B""" + * m = len(S) + * l = [int(c, B) for c in S[::-1]] + * b, k = B, m + * while k > 1: + * last = [l[-1]] if k % 2 == 1 else [] + * l = [l1 + b*l2 for l1, l2 in zip(l[::2], l[1::2])] + * l.extend(last) + * b, k = b**2, (k + 1) // 2 + * [l0] = l + * return l0 + * + * The final result that we want to compute is l0. At the intermediate stages + * of the algorithm l0 is represented by l which is a list of (binary) integers + * representing l0 in base b. The key step is: + * + * l = [l1 + b*l2 for l1, l2 in zip(l[::2], l[1::2])] + * + * This lifts l from a representation of l0 in base b to a representation in + * base b**2 so the base goes from 10 to 100 to 10000 etc and eventually the + * base is large enough that l0 is represented by a single base b digit. + * + * The algorithm is not intrinsically subquadratic but rather delegates the + * heavy lifting to integer multiplication which is subquadratic for + * sufficiently large integers so the complexity is M(n)*log(n) where M(n) is + * the complexity of multiplying two n-bit integers. 
+ * + * This is essentially algorithm 1.25 (FastIntegerInput) from section 1.7.2 of + * Modern Computer Arithmetic, Richard P. Brent and Paul Zimmermann. + */ +static int +long_from_base_subquadratic(const char *start, const char *end, + Py_ssize_t digits, int base, PyLongObject **res) +{ + Py_ssize_t chunk_size = BASE_QUADRATIC_CHUNKSIZE; + Py_ssize_t num_chunks = digits / chunk_size + 1; + Py_ssize_t i, k, digits_read; + const char *p; + + PyLongObject **l = NULL; + PyLongObject *base_l = NULL; + PyLongObject *chunk_size_l = NULL; + PyLongObject *B = NULL; + PyLongObject *t1 = NULL; + PyLongObject *t2 = NULL; + + /* l will be initially an array of num_chunks ints representing the final + * result in base base^chunk_size. NOTE(review): the loop headers that + * follow look truncated in this copy of the patch (the text between a + * less-than sign and the next greater-than sign seems to have been + * dropped); compare with the upstream patch before applying. */ + l = (PyLongObject **) PyMem_Malloc(num_chunks * sizeof(PyLongObject *)); + if (l == NULL) { + goto error; + } + for(i=0; i 1) { + /* + * last = [l[-1]] if k % 2 == 1 else [] + * l = [l1 + b*l2 for l1, l2 in zip(l[::2], l[1::2])] + * l.extend(last) + */ + for(i=0; i 1) { + t1 = k_mul(B, B); /* B squared becomes the base for the next pass */ + if (t1 == NULL) { + goto error; + } + Py_DECREF(B); + B = t1; + t1 = NULL; + } + } + + /* Success! Now l == [l0]. The reference held in l[0] is handed to *res + * without an extra INCREF, so only the pointer array is freed here. */ + Py_DECREF(B); + *res = l[0]; + PyMem_Free(l); + return 0; + +error: /* drop every reference this function may still hold, then the array */ + Py_XDECREF(base_l); + Py_XDECREF(chunk_size_l); + Py_XDECREF(B); + Py_XDECREF(t1); + Py_XDECREF(t2); + if (l != NULL) { + for (i = 0; i < num_chunks; ++i) { + Py_XDECREF(l[i]); + } + PyMem_Free(l); + } + *res = NULL; + return 0; /* 0 is also the success return value; here *res is NULL instead */ +} + /* *str points to the first digit in a string of base `base` digits. base is an * integer from 2 to 36 inclusive. Here we don't need to worry about prefixes * like 0x or leading +- signs. The string should be null terminated consisting @@ -2586,8 +2804,15 @@ long_from_string_base(const char **str, int base, PyLongObject **res) return 0; } } - /* Use the quadratic algorithm for non binary bases. 
*/ - return long_from_non_binary_base(start, end, digits, base, res); + + if (digits <= BASE_QUADRATIC_CUTOFF) { + /* Use the quadratic algorithm for smaller strings. */ + return long_from_base_quadratic(start, end, digits, base, res); + } + else { + /* Use the subquadratic algorithm for larger strings. */ + return long_from_base_subquadratic(start, end, digits, base, res); + } } }