From 9d0bb1c3f150f1bfc0c1c6f1fad46c47b3ff9978 Mon Sep 17 00:00:00 2001 From: "d.grigonis" Date: Tue, 4 Jun 2024 13:21:14 +0300 Subject: [PATCH 01/28] alpha version ready for review --- Objects/stringlib/fastsearch.h | 561 ++++++++++++++++++++------------- Objects/stringlib/stringdefs.h | 4 +- 2 files changed, 344 insertions(+), 221 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 309ed1554f4699..18b27e5b30e717 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -178,21 +178,30 @@ STRINGLIB(rfind_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch) #undef MEMRCHR_CUT_OFF -/* Change to a 1 to see logging comments walk through the algorithm. */ -#if 0 && STRINGLIB_SIZEOF_CHAR == 1 -# define LOG(...) printf(__VA_ARGS__) -# define LOG_STRING(s, n) printf("\"%.*s\"", (int)(n), s) -# define LOG_LINEUP() do { \ - LOG("> "); LOG_STRING(haystack, len_haystack); LOG("\n> "); \ - LOG("%*s",(int)(window_last - haystack + 1 - len_needle), ""); \ - LOG_STRING(needle, len_needle); LOG("\n"); \ -} while(0) + +/* Change to 1 or 2 to see logging comments walk through the algorithm. + * LOG_LEVEL == 1 print excludes input strings (useful for long inputs) + * LOG_LEVEL == 2 print includes input alignments */ +# define LOG_LEVEL 0 +#if LOG_LEVEL == 1 && STRINGLIB_SIZEOF_CHAR == 1 + # define LOG(...) printf(__VA_ARGS__) + # define LOG_STRING(s, n) + # define LOG_LINEUP() +#elif LOG_LEVEL == 2 && STRINGLIB_SIZEOF_CHAR == 1 + # define LOG(...) printf(__VA_ARGS__) + # define LOG_STRING(s, n) printf("\"%.*s\"", (int)(n), s) + # define LOG_LINEUP() do { \ + LOG("> "); LOG_STRING(haystack, len_haystack); LOG("\n> "); \ + LOG("%*s",(int)(window_last - haystack + 1 - len_needle), ""); \ + LOG_STRING(needle, len_needle); LOG("\n"); \ + } while(0) #else -# define LOG(...) -# define LOG_STRING(s, n) -# define LOG_LINEUP() + # define LOG(...) 
+ # define LOG_STRING(s, n) + # define LOG_LINEUP() #endif + Py_LOCAL_INLINE(Py_ssize_t) STRINGLIB(_lex_search)(const STRINGLIB_CHAR *needle, Py_ssize_t len_needle, Py_ssize_t *return_period, int invert_alphabet) @@ -245,6 +254,7 @@ STRINGLIB(_lex_search)(const STRINGLIB_CHAR *needle, Py_ssize_t len_needle, return max_suffix; } + Py_LOCAL_INLINE(Py_ssize_t) STRINGLIB(_factorize)(const STRINGLIB_CHAR *needle, Py_ssize_t len_needle, @@ -299,6 +309,7 @@ STRINGLIB(_factorize)(const STRINGLIB_CHAR *needle, LOG("split: "); LOG_STRING(needle, cut); LOG(" + "); LOG_STRING(needle + cut, len_needle - cut); LOG("\n"); + LOG("Period: %ld\n", period); *return_period = period; return cut; @@ -324,224 +335,350 @@ typedef struct STRINGLIB(_pre) { static void -STRINGLIB(_preprocess)(const STRINGLIB_CHAR *needle, Py_ssize_t len_needle, - STRINGLIB(prework) *p) +STRINGLIB(_preprocess)(const STRINGLIB_CHAR *needle, + Py_ssize_t len_needle, + STRINGLIB(prework) *pw, + int critical_fac, + int bc_table_gs_gap) { - p->needle = needle; - p->len_needle = len_needle; - p->cut = STRINGLIB(_factorize)(needle, len_needle, &(p->period)); - assert(p->period + p->cut <= len_needle); - p->is_periodic = (0 == memcmp(needle, - needle + p->period, - p->cut * STRINGLIB_SIZEOF_CHAR)); - if (p->is_periodic) { - assert(p->cut <= len_needle/2); - assert(p->cut < p->period); - } - else { - // A lower bound on the period - p->period = Py_MAX(p->cut, len_needle - p->cut) + 1; - } - // The gap between the last character and the previous - // occurrence of an equivalent character (modulo TABLE_SIZE) - p->gap = len_needle; - STRINGLIB_CHAR last = needle[len_needle - 1] & TABLE_MASK; - for (Py_ssize_t i = len_needle - 2; i >= 0; i--) { - STRINGLIB_CHAR x = needle[i] & TABLE_MASK; - if (x == last) { - p->gap = len_needle - 1 - i; - break; + // Set the Needle & Calculate Critical Factorization + if (critical_fac) { + pw->needle = needle; + pw->len_needle = len_needle; + pw->cut = STRINGLIB(_factorize)(needle, 
len_needle, &(pw->period)); + assert(pw->period + pw->cut <= len_needle); + pw->is_periodic = (0 == memcmp(needle, + needle + pw->period, + pw->cut * STRINGLIB_SIZEOF_CHAR)); + if (pw->is_periodic) { + assert(pw->cut <= len_needle/2); + assert(pw->cut < pw->period); + } + else { + // A lower bound on the period + pw->period = Py_MAX(pw->cut, len_needle - pw->cut) + 1; } } - // Fill up a compressed Boyer-Moore "Bad Character" table - Py_ssize_t not_found_shift = Py_MIN(len_needle, MAX_SHIFT); - for (Py_ssize_t i = 0; i < (Py_ssize_t)TABLE_SIZE; i++) { - p->table[i] = Py_SAFE_DOWNCAST(not_found_shift, - Py_ssize_t, SHIFT_TYPE); - } - for (Py_ssize_t i = len_needle - not_found_shift; i < len_needle; i++) { - SHIFT_TYPE shift = Py_SAFE_DOWNCAST(len_needle - 1 - i, + if (bc_table_gs_gap) { + // Initialize "Good Suffix" Last Character Gap + // Note: gap("___aa") = 1 + pw->gap = len_needle; + STRINGLIB_CHAR last = needle[len_needle - 1] & TABLE_MASK; + for (Py_ssize_t i = len_needle - 2; i >= 0; i--) { + STRINGLIB_CHAR x = needle[i] & TABLE_MASK; + if (x == last) { + pw->gap = len_needle - 1 - i; + break; + } + } + LOG("Good Suffix Gap: %ld\n", pw->gap); + // Fill up a compressed Boyer-Moore "Bad Character" table + Py_ssize_t not_found_shift = Py_MIN(len_needle, MAX_SHIFT); + for (Py_ssize_t i = 0; i < (Py_ssize_t)TABLE_SIZE; i++) { + pw->table[i] = Py_SAFE_DOWNCAST(not_found_shift, Py_ssize_t, SHIFT_TYPE); - p->table[needle[i] & TABLE_MASK] = shift; + } + for (Py_ssize_t i = len_needle - not_found_shift; i < len_needle; i++) { + SHIFT_TYPE shift = Py_SAFE_DOWNCAST(len_needle - 1 - i, + Py_ssize_t, SHIFT_TYPE); + pw->table[needle[i] & TABLE_MASK] = shift; + } } } + static Py_ssize_t -STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, - STRINGLIB(prework) *p) +STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, + Py_ssize_t len_haystack, + Py_ssize_t maxcount, + int mode, + STRINGLIB(prework) *pw) { // Crochemore and Perrin's (1991) Two-Way 
algorithm. // See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260 - const Py_ssize_t len_needle = p->len_needle; - const Py_ssize_t cut = p->cut; - Py_ssize_t period = p->period; - const STRINGLIB_CHAR *const needle = p->needle; + const Py_ssize_t len_needle = pw->len_needle; + const Py_ssize_t cut = pw->cut; + Py_ssize_t period = pw->period; + const STRINGLIB_CHAR *const needle = pw->needle; const STRINGLIB_CHAR *window_last = haystack + len_needle - 1; const STRINGLIB_CHAR *const haystack_end = haystack + len_haystack; - SHIFT_TYPE *table = p->table; + SHIFT_TYPE *table = pw->table; const STRINGLIB_CHAR *window; LOG("===== Two-way: \"%s\" in \"%s\". =====\n", needle, haystack); - - Py_ssize_t gap = p->gap; + if (mode == FAST_COUNT){ + LOG("###### Counting \"%s\" in \"%s\".\n", needle, haystack); + } + else { + LOG("###### Finding \"%s\" in \"%s\".\n", needle, haystack); + } + Py_ssize_t count = 0; + Py_ssize_t gap = pw->gap; + Py_ssize_t shift, i; + Py_ssize_t iloop=0, ihits = 0; Py_ssize_t gap_jump_end = Py_MIN(len_needle, cut + gap); - if (p->is_periodic) { + int is_periodic = pw->is_periodic; + Py_ssize_t memory = 0; + if (is_periodic) { LOG("Needle is periodic.\n"); - Py_ssize_t memory = 0; - periodicwindowloop: - while (window_last < haystack_end) { - assert(memory == 0); - for (;;) { - LOG_LINEUP(); - Py_ssize_t shift = table[(*window_last) & TABLE_MASK]; - window_last += shift; - if (shift == 0) { - break; - } - if (window_last >= haystack_end) { - return -1; - } - LOG("Horspool skip\n"); - } - no_shift: - window = window_last - len_needle + 1; - assert((window[len_needle - 1] & TABLE_MASK) == - (needle[len_needle - 1] & TABLE_MASK)); - Py_ssize_t i = Py_MAX(cut, memory); - for (; i < len_needle; i++) { - if (needle[i] != window[i]) { - if (i < gap_jump_end) { - LOG("Early right half mismatch: jump by gap.\n"); - assert(gap >= i - cut + 1); - window_last += gap; - } - else { - LOG("Late right half mismatch: jump by n (>gap)\n"); - 
assert(i - cut + 1 > gap); - window_last += i - cut + 1; - } - memory = 0; - goto periodicwindowloop; + } + else { + LOG("Needle is not periodic.\n"); + period = Py_MAX(gap, period); + } + while (window_last < haystack_end) { + assert(memory == 0); + LOG_LINEUP(); + iloop++; + shift = table[(*window_last) & TABLE_MASK]; + window_last += shift; + if (shift != 0){ + LOG("Horspool skip\n"); + continue; + } + if (window_last >= haystack_end){ + break; + } + no_shift: + window = window_last - len_needle + 1; + assert((window[len_needle - 1] & TABLE_MASK) == + (needle[len_needle - 1] & TABLE_MASK)); + if (is_periodic) { + i = Py_MAX(cut, memory); + } else { + i = cut; + } + for (; i < len_needle; i++) { + iloop += 2; + if (needle[i] != window[i]) { + if (i < gap_jump_end) { + LOG("Early right half mismatch: jump by gap.\n"); + assert(gap >= i - cut + 1); + window_last += gap; } - } - for (i = memory; i < cut; i++) { - if (needle[i] != window[i]) { - LOG("Left half does not match.\n"); - window_last += period; - memory = len_needle - period; - if (window_last >= haystack_end) { - return -1; - } - Py_ssize_t shift = table[(*window_last) & TABLE_MASK]; - if (shift) { - // A mismatch has been identified to the right - // of where i will next start, so we can jump - // at least as far as if the mismatch occurred - // on the first comparison. 
- Py_ssize_t mem_jump = Py_MAX(cut, memory) - cut + 1; - LOG("Skip with Memory.\n"); - memory = 0; - window_last += Py_MAX(shift, mem_jump); - goto periodicwindowloop; - } - goto no_shift; + else { + LOG("Late right half mismatch: jump by n (>gap)\n"); + assert(i - cut + 1 > gap); + window_last += i - cut + 1; } + memory = 0; + break; } - LOG("Found a match!\n"); - return window - haystack; } - } - else { - period = Py_MAX(gap, period); - LOG("Needle is not periodic.\n"); - windowloop: - while (window_last < haystack_end) { - for (;;) { - LOG_LINEUP(); - Py_ssize_t shift = table[(*window_last) & TABLE_MASK]; - window_last += shift; - if (shift == 0) { + if (i != len_needle){ + continue; + } + if (is_periodic) { + i = memory; + } else { + i = 0; + } + for (; i < cut; i++) { + ihits++; + if (needle[i] != window[i]) { + LOG("Left half does not match.\n"); + window_last += period; + if (!is_periodic){ break; } + memory = len_needle - period; if (window_last >= haystack_end) { - return -1; - } - LOG("Horspool skip\n"); - } - window = window_last - len_needle + 1; - assert((window[len_needle - 1] & TABLE_MASK) == - (needle[len_needle - 1] & TABLE_MASK)); - Py_ssize_t i = cut; - for (; i < len_needle; i++) { - if (needle[i] != window[i]) { - if (i < gap_jump_end) { - LOG("Early right half mismatch: jump by gap.\n"); - assert(gap >= i - cut + 1); - window_last += gap; - } - else { - LOG("Late right half mismatch: jump by n (>gap)\n"); - assert(i - cut + 1 > gap); - window_last += i - cut + 1; - } - goto windowloop; + break; } - } - for (Py_ssize_t i = 0; i < cut; i++) { - if (needle[i] != window[i]) { - LOG("Left half does not match.\n"); - window_last += period; - goto windowloop; + iloop++; + Py_ssize_t shift = table[(*window_last) & TABLE_MASK]; + if (!shift){ + goto no_shift; } + // A mismatch has been identified to the right + // of where i will next start, so we can jump + // at least as far as if the mismatch occurred + // on the first comparison. 
+ Py_ssize_t mem_jump = Py_MAX(cut, memory) - cut + 1; + LOG("Skip with Memory.\n"); + memory = 0; + window_last += Py_MAX(shift, mem_jump); + break; } + } + if (i == cut) { LOG("Found a match!\n"); - return window - haystack; + if (mode != FAST_COUNT) { + return window - haystack; + } + count++; + if (count == maxcount) { + return maxcount; + } + window_last += len_needle; } } - LOG("Not found. Returning -1.\n"); - return -1; + // Loop Counter and Memory Access Counter Logging (Used for calibration) + // In worst case scenario iloop == n - m + // iloop == ihits indicates linear performance for quadratic problems + LOG("iloop: %ld\n", iloop); + LOG("ihits: %ld\n", ihits); + if (mode == FAST_COUNT) { + LOG("Counting finished.\n"); + return count; + } + else { + LOG("Not found. Returning -1.\n"); + return -1; + } } static Py_ssize_t -STRINGLIB(_two_way_find)(const STRINGLIB_CHAR *haystack, - Py_ssize_t len_haystack, - const STRINGLIB_CHAR *needle, - Py_ssize_t len_needle) +STRINGLIB(two_way_find)(const STRINGLIB_CHAR *haystack, + Py_ssize_t len_haystack, + const STRINGLIB_CHAR *needle, + Py_ssize_t len_needle, + Py_ssize_t maxcount, + int mode) { - LOG("###### Finding \"%s\" in \"%s\".\n", needle, haystack); - STRINGLIB(prework) p; - STRINGLIB(_preprocess)(needle, len_needle, &p); - return STRINGLIB(_two_way)(haystack, len_haystack, &p); + STRINGLIB(prework) pw; + STRINGLIB(_preprocess)(needle, len_needle, &pw, 1, 1); + return STRINGLIB(_two_way)(haystack, len_haystack, maxcount, mode, &pw); } static Py_ssize_t -STRINGLIB(_two_way_count)(const STRINGLIB_CHAR *haystack, - Py_ssize_t len_haystack, - const STRINGLIB_CHAR *needle, - Py_ssize_t len_needle, - Py_ssize_t maxcount) +STRINGLIB(horspool_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, + const STRINGLIB_CHAR* p, Py_ssize_t m, + Py_ssize_t maxcount, int mode, int dynamic) { - LOG("###### Counting \"%s\" in \"%s\".\n", needle, haystack); - STRINGLIB(prework) p; - STRINGLIB(_preprocess)(needle, len_needle, &p); - 
Py_ssize_t index = 0, count = 0; - while (1) { - Py_ssize_t result; - result = STRINGLIB(_two_way)(haystack + index, - len_haystack - index, &p); - if (result == -1) { - return count; - } - count++; - if (count == maxcount) { - return maxcount; - } - index += result + len_needle; + /* Boyer–Moore–Horspool algorithm + with optional dynamic fallback to Two-Way algorithm */ + Py_ssize_t mlast = m - 1; + const Py_ssize_t width = n - m; + const STRINGLIB_CHAR plast = p[mlast]; + const STRINGLIB_CHAR *ss = s + mlast; // Last window character + STRINGLIB_CHAR slast; + + // Pre-Work + STRINGLIB(prework) pw; + STRINGLIB(_preprocess)(p, m, &pw, 0, 1); + Py_ssize_t gap = pw.gap; + SHIFT_TYPE *table = pw.table; + + // Use Bloom for len(needle) <= 64 + // Initialization is costly for long needles + // And this is not much beneficial for large set(needle) + unsigned long mask = 0; + Py_ssize_t bloom_gap = 0; + if (m <= 64) { + LOG("Using Bloom skip\n"); + // Note: bloom_gap("___aa") = 1 + bloom_gap = m; + STRINGLIB_BLOOM_ADD(mask, plast); + for (Py_ssize_t i = 0; i < mlast; i++) { + STRINGLIB_BLOOM_ADD(mask, p[i]); + if (p[i] == plast) { + bloom_gap = mlast - i; + } + } } - return count; + // Horspool Calibration + const float hrs_lcost = 4.0; // average loop cost + const float hrs_hcost = 0.4; // false positive hit cost + const int ih_min = 100; // minimum FP hits to fallback + // Two-Way Calibration + const float twy_icost = 3.0 * m; // initialization cost + const float twy_lcost = 3.0; // loop cost + /* Running variables */ + float loops_left, exp_hrs, exp_twy; + Py_ssize_t count = 0; + Py_ssize_t shift, i, j; + Py_ssize_t iloop=0, ihits=0; + for (i = 0; i <= width;) { + iloop++; + slast = ss[i]; + if (bloom_gap) { + if (slast != plast){ + if (!STRINGLIB_BLOOM(mask, ss[i+1])){ + i += m + 1; + LOG("Bloom skip\n"); + } + else { + shift = table[slast & TABLE_MASK]; + i += shift; + if (shift == 0){ + i += 1; + } + LOG("Modified Horspool skip\n"); + } + continue; + } + 
assert(slast == plast); + } + else { + shift = table[slast & TABLE_MASK]; + i += shift; + if (shift != 0){ + LOG("Horspool skip\n"); + continue; + } + assert((ss[i] & TABLE_MASK) == (p[mlast] & TABLE_MASK)); + } + if (i > width){ + break; + } + for (j = 0; j < m; j++) { + ihits++; + if (s[i + j] != p[j]) { + break; + } + } + if (j == m) { + LOG("Found a match at %ld!\n", i); + if (mode != FAST_COUNT) { + return i; + } + count++; + if (count == maxcount) { + return maxcount; + } + i += m; + } + else if (bloom_gap && !STRINGLIB_BLOOM(mask, ss[i+1])) { + LOG("move by (m + 1) = %ld\n", m + 1); + i += m + 1; + } + else { + if (bloom_gap) { + LOG("move by bloom gap = %ld\n", gap); + i += bloom_gap; + } else { + LOG("move by gap = %ld\n", gap); + i += gap; + } + } + if (dynamic && ihits > ih_min) { + loops_left = width - i + 1; + exp_hrs = (iloop * hrs_lcost + ihits * hrs_hcost) / i * loops_left; + exp_twy = twy_icost + loops_left * twy_lcost; + if (exp_twy < exp_hrs) { + STRINGLIB(_preprocess)(p, m, &pw, 1, 0); + Py_ssize_t res = STRINGLIB(_two_way)( + s + i, n - i, maxcount - count, mode, &pw); + if (mode == FAST_SEARCH) { + return res == -1 ? -1 : res + i; + } + else { + return res + count; + } + } + } + } + // Loop Counter and False Hit Counter Logging + // In worst case scenario iloop > n - m. + // Used for calibration and fallback decision + LOG("%ld\n", iloop); + LOG("%ld\n", ihits); + return mode == FAST_COUNT ? 
count : -1; } + #undef SHIFT_TYPE #undef NOT_FOUND #undef SHIFT_OVERFLOW @@ -552,6 +689,8 @@ STRINGLIB(_two_way_count)(const STRINGLIB_CHAR *haystack, #undef LOG #undef LOG_STRING #undef LOG_LINEUP +#undef LOG_LEVEL + static inline Py_ssize_t STRINGLIB(default_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, @@ -560,18 +699,20 @@ STRINGLIB(default_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, { const Py_ssize_t w = n - m; Py_ssize_t mlast = m - 1, count = 0; - Py_ssize_t gap = mlast; const STRINGLIB_CHAR last = p[mlast]; const STRINGLIB_CHAR *const ss = &s[mlast]; + // Initialize Bloom + // Note: gap("___aa") = 0 + Py_ssize_t gap = mlast; unsigned long mask = 0; + STRINGLIB_BLOOM_ADD(mask, last); for (Py_ssize_t i = 0; i < mlast; i++) { STRINGLIB_BLOOM_ADD(mask, p[i]); if (p[i] == last) { gap = mlast - i - 1; } } - STRINGLIB_BLOOM_ADD(mask, last); for (Py_ssize_t i = 0; i <= w; i++) { if (ss[i] == last) { @@ -657,13 +798,12 @@ STRINGLIB(adaptive_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, } hits += j + 1; if (hits > m / 4 && w - i > 2000) { + res = STRINGLIB(two_way_find)( + s + i, n - i, p, m, maxcount, mode); if (mode == FAST_SEARCH) { - res = STRINGLIB(_two_way_find)(s + i, n - i, p, m); return res == -1 ? -1 : res + i; } else { - res = STRINGLIB(_two_way_count)(s + i, n - i, p, m, - maxcount - count); return res + count; } } @@ -761,7 +901,6 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, if (n < m || (mode == FAST_COUNT && maxcount == 0)) { return -1; } - /* look for special cases */ if (m <= 1) { if (m <= 0) { @@ -776,36 +915,20 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, return STRINGLIB(count_char)(s, n, p[0], maxcount); } } - + else if (n == m) { + /* use special case when both strings are of equal length */ + int res = memcmp(s, p, m * sizeof(STRINGLIB_CHAR)); + if (mode == FAST_COUNT){ + return res == 0 ? 1 : 0; + } else { + return res == 0 ? 
0 : -1; + } + } if (mode != FAST_RSEARCH) { - if (n < 2500 || (m < 100 && n < 30000) || m < 6) { - return STRINGLIB(default_find)(s, n, p, m, maxcount, mode); - } - else if ((m >> 2) * 3 < (n >> 2)) { - /* 33% threshold, but don't overflow. */ - /* For larger problems where the needle isn't a huge - percentage of the size of the haystack, the relatively - expensive O(m) startup cost of the two-way algorithm - will surely pay off. */ - if (mode == FAST_SEARCH) { - return STRINGLIB(_two_way_find)(s, n, p, m); - } - else { - return STRINGLIB(_two_way_count)(s, n, p, m, maxcount); - } - } - else { - /* To ensure that we have good worst-case behavior, - here's an adaptive version of the algorithm, where if - we match O(m) characters without any matches of the - entire needle, then we predict that the startup cost of - the two-way algorithm will probably be worth it. */ - return STRINGLIB(adaptive_find)(s, n, p, m, maxcount, mode); - } + return STRINGLIB(horspool_find)(s, n, p, m, maxcount, mode, 1); } else { /* FAST_RSEARCH */ return STRINGLIB(default_rfind)(s, n, p, m, maxcount, mode); } } - diff --git a/Objects/stringlib/stringdefs.h b/Objects/stringlib/stringdefs.h index 484b98b7291309..230b9978bcde2a 100644 --- a/Objects/stringlib/stringdefs.h +++ b/Objects/stringlib/stringdefs.h @@ -6,8 +6,8 @@ compiled as unicode. 
*/ #define STRINGLIB_IS_UNICODE 0 -#define FASTSEARCH fastsearch -#define STRINGLIB(F) stringlib_##F +#define FASTSEARCH fastsearch +#define STRINGLIB(F) stringlib_##F #define STRINGLIB_OBJECT PyBytesObject #define STRINGLIB_SIZEOF_CHAR 1 #define STRINGLIB_CHAR char From 3bb688ab7e55ff53bd752852d5765501a3bf692e Mon Sep 17 00:00:00 2001 From: "d.grigonis" Date: Wed, 5 Jun 2024 01:23:16 +0300 Subject: [PATCH 02/28] rfind implemented --- Objects/stringlib/fastsearch.h | 613 +++++++++++++++++++++++++++++---- 1 file changed, 539 insertions(+), 74 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 18b27e5b30e717..409b52ac4a87c1 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -184,21 +184,28 @@ STRINGLIB(rfind_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch) * LOG_LEVEL == 2 print includes input alignments */ # define LOG_LEVEL 0 #if LOG_LEVEL == 1 && STRINGLIB_SIZEOF_CHAR == 1 - # define LOG(...) printf(__VA_ARGS__) - # define LOG_STRING(s, n) - # define LOG_LINEUP() +# define LOG(...) printf(__VA_ARGS__) +# define LOG_STRING(s, n) +# define LOG_LINEUP() +# define LOG_LINEUP_REV() #elif LOG_LEVEL == 2 && STRINGLIB_SIZEOF_CHAR == 1 - # define LOG(...) printf(__VA_ARGS__) - # define LOG_STRING(s, n) printf("\"%.*s\"", (int)(n), s) - # define LOG_LINEUP() do { \ - LOG("> "); LOG_STRING(haystack, len_haystack); LOG("\n> "); \ - LOG("%*s",(int)(window_last - haystack + 1 - len_needle), ""); \ - LOG_STRING(needle, len_needle); LOG("\n"); \ - } while(0) +# define LOG(...) 
printf(__VA_ARGS__) +# define LOG_STRING(s, n) printf("\"%.*s\"", (int)(n), s) +# define LOG_LINEUP() do { \ + LOG("> "); LOG_STRING(haystack, len_haystack); LOG("\n> "); \ + LOG("%*s",(int)(window_last - haystack + 1 - len_needle), ""); \ + LOG_STRING(needle, len_needle); LOG("\n"); \ +} while(0) +# define LOG_LINEUP_REV() do { \ + LOG("> "); LOG_STRING(haystack, len_haystack); LOG("\n> "); \ + LOG("%*s",(int)(window - haystack), ""); \ + LOG_STRING(needle, len_needle); LOG("\n"); \ +} while(0) #else - # define LOG(...) - # define LOG_STRING(s, n) - # define LOG_LINEUP() +# define LOG(...) +# define LOG_STRING(s, n) +# define LOG_LINEUP() +# define LOG_LINEUP_REV() #endif @@ -255,6 +262,59 @@ STRINGLIB(_lex_search)(const STRINGLIB_CHAR *needle, Py_ssize_t len_needle, } +Py_LOCAL_INLINE(Py_ssize_t) +STRINGLIB(_lex_search_rev)(const STRINGLIB_CHAR *needle, Py_ssize_t len_needle, + Py_ssize_t *return_period, int invert_alphabet) +{ + /* Do a lexicographic search. Essentially this: + >>> max(needle[i:] for i in range(len(needle)+1)) + Also find the period of the right half. */ + Py_ssize_t max_suffix = 0; + Py_ssize_t candidate = 1; + Py_ssize_t k = 0; + // The period of the right half. + Py_ssize_t period = 1; + Py_ssize_t offset = len_needle - 1; + while (candidate + k < len_needle) { + // each loop increases candidate + k + max_suffix + STRINGLIB_CHAR a = needle[offset - candidate - k]; + STRINGLIB_CHAR b = needle[offset - max_suffix - k]; + // check if the suffix at candidate is better than max_suffix + if (invert_alphabet ? (b < a) : (a < b)) { + // Fell short of max_suffix. + // The next k + 1 characters are non-increasing + // from candidate, so they won't start a maximal suffix. + candidate += k + 1; + k = 0; + // We've ruled out any period smaller than what's + // been scanned since max_suffix. 
+ period = candidate - max_suffix; + } + else if (a == b) { + if (k + 1 != period) { + // Keep scanning the equal strings + k++; + } + else { + // Matched a whole period. + // Start matching the next period. + candidate += period; + k = 0; + } + } + else { + // Did better than max_suffix, so replace it. + max_suffix = candidate; + candidate++; + k = 0; + period = 1; + } + } + *return_period = period; + return len_needle - max_suffix - 1; +} + + Py_LOCAL_INLINE(Py_ssize_t) STRINGLIB(_factorize)(const STRINGLIB_CHAR *needle, Py_ssize_t len_needle, @@ -295,7 +355,6 @@ STRINGLIB(_factorize)(const STRINGLIB_CHAR *needle, Py_ssize_t cut1, period1, cut2, period2, cut, period; cut1 = STRINGLIB(_lex_search)(needle, len_needle, &period1, 0); cut2 = STRINGLIB(_lex_search)(needle, len_needle, &period2, 1); - // Take the later cut. if (cut1 > cut2) { period = period1; @@ -309,7 +368,35 @@ STRINGLIB(_factorize)(const STRINGLIB_CHAR *needle, LOG("split: "); LOG_STRING(needle, cut); LOG(" + "); LOG_STRING(needle + cut, len_needle - cut); LOG("\n"); - LOG("Period: %ld\n", period); + LOG("Cut: %ld & Period: %ld\n", cut, period); + + *return_period = period; + return cut; +} + + +Py_LOCAL_INLINE(Py_ssize_t) +STRINGLIB(_factorize_rev)(const STRINGLIB_CHAR *needle, + Py_ssize_t len_needle, + Py_ssize_t *return_period) +{ + Py_ssize_t cut1, period1, cut2, period2, cut, period; + cut1 = STRINGLIB(_lex_search_rev)(needle, len_needle, &period1, 0); + cut2 = STRINGLIB(_lex_search_rev)(needle, len_needle, &period2, 1); + // Take the later cut. 
+ if (cut1 < cut2) { + period = period1; + cut = cut1; + } + else { + period = period2; + cut = cut2; + } + + LOG("split: "); LOG_STRING(needle, cut + 1); + LOG(" + "); LOG_STRING(needle + cut + 1, len_needle); + LOG("\n"); + LOG("Cut: %ld & Period: %ld\n", cut, period); *return_period = period; return cut; @@ -338,6 +425,7 @@ static void STRINGLIB(_preprocess)(const STRINGLIB_CHAR *needle, Py_ssize_t len_needle, STRINGLIB(prework) *pw, + int direction, int critical_fac, int bc_table_gs_gap) { @@ -345,44 +433,89 @@ STRINGLIB(_preprocess)(const STRINGLIB_CHAR *needle, if (critical_fac) { pw->needle = needle; pw->len_needle = len_needle; - pw->cut = STRINGLIB(_factorize)(needle, len_needle, &(pw->period)); - assert(pw->period + pw->cut <= len_needle); - pw->is_periodic = (0 == memcmp(needle, - needle + pw->period, - pw->cut * STRINGLIB_SIZEOF_CHAR)); - if (pw->is_periodic) { - assert(pw->cut <= len_needle/2); - assert(pw->cut < pw->period); + if (direction == 1) { + pw->cut = STRINGLIB(_factorize)(needle, len_needle, &(pw->period)); + assert(pw->period + pw->cut <= len_needle); + //bbb c + // | + //bbb + // bb c + // + //c bbb + // | + pw->is_periodic = (0 == memcmp(needle, + needle + pw->period, + pw->cut * STRINGLIB_SIZEOF_CHAR)); + if (pw->is_periodic) { + assert(pw->cut <= len_needle/2); + assert(pw->cut < pw->period); + } + else { + // A lower bound on the period + pw->period = Py_MAX(pw->cut, len_needle - pw->cut) + 1; + } } else { - // A lower bound on the period - pw->period = Py_MAX(pw->cut, len_needle - pw->cut) + 1; + // [0, 1, 2, 3] + pw->cut = STRINGLIB(_factorize_rev)( + needle, len_needle, &(pw->period)); + Py_ssize_t inv_cut = len_needle - pw->cut - 1; + assert(pw->cut - pw->period >= 0); + pw->is_periodic = (0 == memcmp(needle + len_needle - inv_cut, + needle + len_needle - inv_cut - pw->period, + inv_cut * STRINGLIB_SIZEOF_CHAR)); + if (pw->is_periodic) { + assert(pw->cut >= len_needle/2); + assert(len_needle - pw->cut - 1 < pw->period); + } + else 
{ + // A lower bound on the period + pw->period = Py_MIN(pw->cut, len_needle + pw->cut) + 1; + } } } if (bc_table_gs_gap) { - // Initialize "Good Suffix" Last Character Gap - // Note: gap("___aa") = 1 - pw->gap = len_needle; - STRINGLIB_CHAR last = needle[len_needle - 1] & TABLE_MASK; - for (Py_ssize_t i = len_needle - 2; i >= 0; i--) { - STRINGLIB_CHAR x = needle[i] & TABLE_MASK; - if (x == last) { - pw->gap = len_needle - 1 - i; - break; - } - } - LOG("Good Suffix Gap: %ld\n", pw->gap); // Fill up a compressed Boyer-Moore "Bad Character" table Py_ssize_t not_found_shift = Py_MIN(len_needle, MAX_SHIFT); for (Py_ssize_t i = 0; i < (Py_ssize_t)TABLE_SIZE; i++) { pw->table[i] = Py_SAFE_DOWNCAST(not_found_shift, Py_ssize_t, SHIFT_TYPE); } - for (Py_ssize_t i = len_needle - not_found_shift; i < len_needle; i++) { - SHIFT_TYPE shift = Py_SAFE_DOWNCAST(len_needle - 1 - i, - Py_ssize_t, SHIFT_TYPE); - pw->table[needle[i] & TABLE_MASK] = shift; + if (direction == 1) { + for (Py_ssize_t i = len_needle - not_found_shift; i < len_needle; i++) { + SHIFT_TYPE shift = Py_SAFE_DOWNCAST(len_needle - 1 - i, + Py_ssize_t, SHIFT_TYPE); + pw->table[needle[i] & TABLE_MASK] = shift; + } + + // Initialize "Good Suffix" Last Character Gap + // Note: gap("___aa") = 1 + pw->gap = len_needle; + STRINGLIB_CHAR last = needle[len_needle - 1] & TABLE_MASK; + for (Py_ssize_t i = len_needle - 2; i >= 0; i--) { + STRINGLIB_CHAR x = needle[i] & TABLE_MASK; + if (x == last) { + pw->gap = len_needle - 1 - i; + break; + } + } } + else { + for (Py_ssize_t i = not_found_shift - 1; i >= 0; i--) { + SHIFT_TYPE shift = Py_SAFE_DOWNCAST(i, Py_ssize_t, SHIFT_TYPE); + pw->table[needle[i] & TABLE_MASK] = shift; + } + pw->gap = len_needle; + STRINGLIB_CHAR last = needle[0] & TABLE_MASK; + for (Py_ssize_t i = 1; i < len_needle; i++) { + STRINGLIB_CHAR x = needle[i] & TABLE_MASK; + if (x == last) { + pw->gap = i; + break; + } + } + } + LOG("Good Suffix Gap: %ld\n", pw->gap); } } @@ -414,7 +547,8 @@ 
STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t count = 0; Py_ssize_t gap = pw->gap; Py_ssize_t shift, i; - Py_ssize_t iloop=0, ihits = 0; + Py_ssize_t iloop = 0; + Py_ssize_t ihits = 0; Py_ssize_t gap_jump_end = Py_MIN(len_needle, cut + gap); int is_periodic = pw->is_periodic; Py_ssize_t memory = 0; @@ -448,7 +582,7 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, i = cut; } for (; i < len_needle; i++) { - iloop += 2; + ihits++; if (needle[i] != window[i]) { if (i < gap_jump_end) { LOG("Early right half mismatch: jump by gap.\n"); @@ -536,28 +670,190 @@ STRINGLIB(two_way_find)(const STRINGLIB_CHAR *haystack, Py_ssize_t maxcount, int mode) { + int dir = 1; STRINGLIB(prework) pw; - STRINGLIB(_preprocess)(needle, len_needle, &pw, 1, 1); + STRINGLIB(_preprocess)(needle, len_needle, &pw, dir, 1, 1); return STRINGLIB(_two_way)(haystack, len_haystack, maxcount, mode, &pw); } +static Py_ssize_t +STRINGLIB(_two_way_rev)(const STRINGLIB_CHAR *haystack, + Py_ssize_t len_haystack, + Py_ssize_t maxcount, + int mode, + STRINGLIB(prework) *pw) +{ + // Reversed Crochemore and Perrin's (1991) Two-Way algorithm. + // See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260 + const Py_ssize_t len_needle = pw->len_needle; + const Py_ssize_t cut = pw->cut; + Py_ssize_t period = pw->period; + const STRINGLIB_CHAR *const needle = pw->needle; + const STRINGLIB_CHAR *const haystack_end = haystack + len_haystack; + const STRINGLIB_CHAR *window = haystack_end - len_needle; + SHIFT_TYPE *table = pw->table; + LOG("===== Two-way: \"%s\" in \"%s\". 
=====\n", needle, haystack); + if (mode == FAST_COUNT){ + LOG("###### Counting \"%s\" in \"%s\".\n", needle, haystack); + } + else { + LOG("###### Finding \"%s\" in \"%s\".\n", needle, haystack); + } + Py_ssize_t inv_cut; + Py_ssize_t count = 0; + Py_ssize_t gap = pw->gap; + Py_ssize_t shift, i; + Py_ssize_t iloop = 0; + Py_ssize_t ihits = 0; + Py_ssize_t gap_jump_end = Py_MAX(0, cut - gap); + int is_periodic = pw->is_periodic; + Py_ssize_t memory = 0; + if (is_periodic) { + LOG("Needle is periodic.\n"); + } + else { + LOG("Needle is not periodic.\n"); + period = Py_MAX(gap, period); + } + while (window >= haystack) { + assert(memory == 0); + LOG_LINEUP_REV(); + iloop++; + shift = table[(*window) & TABLE_MASK]; + window -= shift; + if (shift != 0){ + LOG("Horspool skip\n"); + continue; + } + if (window < haystack){ + break; + } + no_shift: + assert((window[0] & TABLE_MASK) == (needle[0] & TABLE_MASK)); + if (is_periodic) { + i = Py_MIN(cut, len_needle - memory); + } else { + i = cut; + } + for (; i >= 0; i--) { + ihits++; + if (needle[i] != window[i]) { + if (i > gap_jump_end) { + LOG("Early left half mismatch: jump by gap.\n"); + assert(gap >= cut - i + 1); + window -= gap; + } + else { + LOG("Late left half mismatch: jump by n (>gap)\n"); + assert(cut - i + 1 > gap); + window -= cut - i + 1; + } + memory = 0; + break; + } + } + if (i != -1){ + continue; + } + i = len_needle - 1; + if (is_periodic) { + i -= memory; + } + for (; i > cut; i--) { + ihits++; + if (needle[i] != window[i]) { + LOG("Right half does not match.\n"); + window -= period; + if (!is_periodic){ + break; + } + memory = len_needle - period; + if (window < haystack) { + break; + } + iloop++; + Py_ssize_t shift = table[(*window) & TABLE_MASK]; + if (!shift){ + goto no_shift; + } + // A mismatch has been identified to the right + // of where i will next start, so we can jump + // at least as far as if the mismatch occurred + // on the first comparison. 
+ // REF> + // 0 m + + // [0 1 2 3] + // m 0 + inv_cut = len_needle - cut - 1; + Py_ssize_t mem_jump = Py_MAX(inv_cut, memory) - inv_cut + 1; + LOG("Skip with Memory.\n"); + memory = 0; + window -= Py_MAX(shift, mem_jump); + break; + } + } + if (i == cut) { + LOG("Found a match!\n"); + if (mode != FAST_COUNT) { + return window - haystack; + } + count++; + if (count == maxcount) { + return maxcount; + } + window -= len_needle; + } + } + // Loop Counter and Memory Access Counter Logging (Used for calibration) + // In worst case scenario iloop == n - m + // iloop == ihits indicates linear performance for quadratic problems + LOG("iloop: %ld\n", iloop); + LOG("ihits: %ld\n", ihits); + if (mode == FAST_COUNT) { + LOG("Counting finished.\n"); + return count; + } + else { + LOG("Not found. Returning -1.\n"); + return -1; + } +} + + +static Py_ssize_t +STRINGLIB(two_way_rfind)(const STRINGLIB_CHAR *haystack, + Py_ssize_t len_haystack, + const STRINGLIB_CHAR *needle, + Py_ssize_t len_needle, + Py_ssize_t maxcount, + int mode) +{ + int dir = -1; + STRINGLIB(prework) pw; + STRINGLIB(_preprocess)(needle, len_needle, &pw, dir, 1, 1); + return STRINGLIB(_two_way_rev)(haystack, len_haystack, maxcount, mode, &pw); +} + + static Py_ssize_t STRINGLIB(horspool_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, - const STRINGLIB_CHAR* p, Py_ssize_t m, - Py_ssize_t maxcount, int mode, int dynamic) + const STRINGLIB_CHAR* p, Py_ssize_t m, + Py_ssize_t maxcount, int mode, int dynamic) { /* Boyer–Moore–Horspool algorithm with optional dynamic fallback to Two-Way algorithm */ + int dir = 1; + Py_ssize_t shift, i, j; Py_ssize_t mlast = m - 1; const Py_ssize_t width = n - m; - const STRINGLIB_CHAR plast = p[mlast]; - const STRINGLIB_CHAR *ss = s + mlast; // Last window character - STRINGLIB_CHAR slast; + const STRINGLIB_CHAR p_last = p[mlast]; + const STRINGLIB_CHAR *ss = s + mlast; + STRINGLIB_CHAR s_last; // Pre-Work STRINGLIB(prework) pw; - STRINGLIB(_preprocess)(p, m, &pw, 0, 1); + 
STRINGLIB(_preprocess)(p, m, &pw, dir, 0, 1); Py_ssize_t gap = pw.gap; SHIFT_TYPE *table = pw.table; @@ -566,41 +862,53 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, // And this is not much beneficial for large set(needle) unsigned long mask = 0; Py_ssize_t bloom_gap = 0; + Py_ssize_t j_stop = m; if (m <= 64) { - LOG("Using Bloom skip\n"); + LOG("Initializing Bloom\n"); // Note: bloom_gap("___aa") = 1 + j_stop -= 1; bloom_gap = m; - STRINGLIB_BLOOM_ADD(mask, plast); - for (Py_ssize_t i = 0; i < mlast; i++) { + STRINGLIB_BLOOM_ADD(mask, p_last); + for (i = 0; i < mlast; i++) { STRINGLIB_BLOOM_ADD(mask, p[i]); - if (p[i] == plast) { + if (p[i] == p_last) { bloom_gap = mlast - i; } } } // Horspool Calibration - const float hrs_lcost = 4.0; // average loop cost - const float hrs_hcost = 0.4; // false positive hit cost + const float hrs_lcost = 4.0f; // average loop cost + const float hrs_hcost = 0.4f; // false positive hit cost const int ih_min = 100; // minimum FP hits to fallback // Two-Way Calibration - const float twy_icost = 3.0 * m; // initialization cost - const float twy_lcost = 3.0; // loop cost + const float twy_icost = 3.0f * m; // initialization cost + const float twy_lcost = 3.0f; // loop cost /* Running variables */ - float loops_left, exp_hrs, exp_twy; + float loops_left, total_time_hrs; + float exp_hrs, exp_twy; Py_ssize_t count = 0; - Py_ssize_t shift, i, j; - Py_ssize_t iloop=0, ihits=0; + Py_ssize_t iloop=0; + Py_ssize_t ihits=0; + // TODO> synchronize vvariable names!!! 
+ // const Py_ssize_t len_needle = m; + // const Py_ssize_t len_haystack = n; + // const STRINGLIB_CHAR *const needle = p; + // const STRINGLIB_CHAR *window_last; + // const STRINGLIB_CHAR *const haystack = s; + // const STRINGLIB_CHAR *const haystack_end = haystack + len_haystack; for (i = 0; i <= width;) { iloop++; - slast = ss[i]; + s_last = ss[i]; + // window_last = s_last; + // LOG_LINEUP(); if (bloom_gap) { - if (slast != plast){ + if (s_last != p_last){ if (!STRINGLIB_BLOOM(mask, ss[i+1])){ i += m + 1; LOG("Bloom skip\n"); } else { - shift = table[slast & TABLE_MASK]; + shift = table[s_last & TABLE_MASK]; i += shift; if (shift == 0){ i += 1; @@ -609,27 +917,27 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, } continue; } - assert(slast == plast); + assert(s_last == p_last); } else { - shift = table[slast & TABLE_MASK]; + shift = table[s_last & TABLE_MASK]; i += shift; if (shift != 0){ LOG("Horspool skip\n"); continue; } - assert((ss[i] & TABLE_MASK) == (p[mlast] & TABLE_MASK)); + assert((ss[i] & TABLE_MASK) == (p_last & TABLE_MASK)); } if (i > width){ break; } - for (j = 0; j < m; j++) { + for (j = 0; j < j_stop; j++) { ihits++; if (s[i + j] != p[j]) { break; } } - if (j == m) { + if (j == j_stop) { LOG("Found a match at %ld!\n", i); if (mode != FAST_COUNT) { return i; @@ -655,10 +963,11 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, } if (dynamic && ihits > ih_min) { loops_left = width - i + 1; - exp_hrs = (iloop * hrs_lcost + ihits * hrs_hcost) / i * loops_left; + total_time_hrs = iloop * hrs_lcost + ihits * hrs_hcost; + exp_hrs = total_time_hrs / i * loops_left; exp_twy = twy_icost + loops_left * twy_lcost; if (exp_twy < exp_hrs) { - STRINGLIB(_preprocess)(p, m, &pw, 1, 0); + STRINGLIB(_preprocess)(p, m, &pw, dir, 1, 0); Py_ssize_t res = STRINGLIB(_two_way)( s + i, n - i, maxcount - count, mode, &pw); if (mode == FAST_SEARCH) { @@ -673,8 +982,162 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, // 
Loop Counter and False Hit Counter Logging // In worst case scenario iloop > n - m. // Used for calibration and fallback decision - LOG("%ld\n", iloop); - LOG("%ld\n", ihits); + LOG("iloop: %ld\n", iloop); + LOG("ihits: %ld\n", ihits); + return mode == FAST_COUNT ? count : -1; +} + + +static Py_ssize_t +STRINGLIB(horspool_rfind)(const STRINGLIB_CHAR* s, Py_ssize_t n, + const STRINGLIB_CHAR* p, Py_ssize_t m, + Py_ssize_t maxcount, int mode, int dynamic) +{ + /* Reverse Boyer–Moore–Horspool algorithm + with optional dynamic fallback to Two-Way algorithm */ + int dir = -1; + Py_ssize_t shift, i, j; + Py_ssize_t mlast = m - 1; + const Py_ssize_t width = n - m; + const STRINGLIB_CHAR p_first = p[0]; + STRINGLIB_CHAR s_first; + + // Pre-Work + STRINGLIB(prework) pw; + STRINGLIB(_preprocess)(p, m, &pw, dir, 0, 1); + Py_ssize_t gap = pw.gap; + SHIFT_TYPE *table = pw.table; + + // Use Bloom for len(needle) <= 64 + // Initialization is costly for long needles + // And this is not much beneficial for large set(needle) + unsigned long mask = 0; + Py_ssize_t bloom_gap = 0; + Py_ssize_t j_stop = -1; + if (m <= 64) { + LOG("Initializing Bloom\n"); + // Note: bloom_gap("___aa") = 1 + j_stop += 1; + bloom_gap = m; + STRINGLIB_BLOOM_ADD(mask, p_first); + for (i = mlast; i > 0; i--) { + STRINGLIB_BLOOM_ADD(mask, p[i]); + if (p[i] == p_first) { + bloom_gap = i; + } + } + + } + // Horspool Calibration + const float hrs_lcost = 4.0f; // average loop cost + const float hrs_hcost = 0.4f; // false positive hit cost + const int ih_min = 100; // minimum FP hits to fallback + // Two-Way Calibration + const float twy_icost = 3.0f * m; // initialization cost + const float twy_lcost = 3.0f; // loop cost + /* Running variables */ + float loops_left, loops_past, total_time_hrs; + float exp_hrs, exp_twy; + Py_ssize_t count = 0; + Py_ssize_t iloop=0; + Py_ssize_t ihits=0; + // TODO> + // const Py_ssize_t len_needle = m; + // const Py_ssize_t len_haystack = n; + // const STRINGLIB_CHAR *const needle 
= p; + // const STRINGLIB_CHAR *window_last = s + len_needle - 1; + // const STRINGLIB_CHAR *const haystack = s; + // const STRINGLIB_CHAR *const haystack_end = haystack + len_haystack; + // const STRINGLIB_CHAR *window = haystack_end - len_needle; + for (i = width; i >= 0;) { + iloop++; + s_first = s[i]; + // window = s_first; + // LOG_LINEUP_REV(); + if (bloom_gap) { + if (s_first != p_first){ + if (!STRINGLIB_BLOOM(mask, s[i - 1])){ + i -= m + 1; + LOG("Bloom skip\n"); + } + else { + shift = table[s_first & TABLE_MASK]; + i -= shift; + if (shift == 0){ + i -= 1; + } + LOG("Modified Horspool skip\n"); + } + continue; + } + assert(s_first == p_first); + } + else { + shift = table[s_first & TABLE_MASK]; + i -= shift; + if (shift != 0){ + LOG("Horspool skip\n"); + continue; + } + assert((s[i] & TABLE_MASK) == (p_first & TABLE_MASK)); + } + if (i < 0){ + break; + } + for (j = mlast; j > j_stop; j--) { + ihits++; + if (s[i + j] != p[j]) { + break; + } + } + if (j == j_stop) { + LOG("Found a match at %ld!\n", i); + if (mode != FAST_COUNT) { + return i; + } + count++; + if (count == maxcount) { + return maxcount; + } + i -= m; + } + else if (bloom_gap && !STRINGLIB_BLOOM(mask, s[i - 1])) { + LOG("move by (m + 1) = %ld\n", m + 1); + i -= m + 1; + } + else { + if (bloom_gap) { + LOG("move by bloom gap = %ld\n", gap); + i -= bloom_gap; + } else { + LOG("move by gap = %ld\n", gap); + i -= gap; + } + } + if (dynamic && ihits > ih_min) { + loops_past = width - i; + loops_left = i + 1; + total_time_hrs = iloop * hrs_lcost + ihits * hrs_hcost; + exp_hrs = total_time_hrs / loops_past * loops_left; + exp_twy = twy_icost + loops_left * twy_lcost; + if (exp_twy < exp_hrs) { + STRINGLIB(_preprocess)(p, m, &pw, 1, 0, dir); + Py_ssize_t res = STRINGLIB(_two_way_rev)( + s, n - loops_past, maxcount - count, mode, &pw); + if (mode == FAST_SEARCH) { + return res == -1 ? 
-1 : res; + } + else { + return res + count; + } + } + } + } + // Loop Counter and False Hit Counter Logging + // In worst case scenario iloop > n - m. + // Used for calibration and fallback decision + LOG("iloop: %ld\n", iloop); + LOG("ihits: %ld\n", ihits); return mode == FAST_COUNT ? count : -1; } @@ -926,9 +1389,11 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, } if (mode != FAST_RSEARCH) { return STRINGLIB(horspool_find)(s, n, p, m, maxcount, mode, 1); + // return STRINGLIB(two_way_find)(s, n, p, m, maxcount, mode); } else { /* FAST_RSEARCH */ - return STRINGLIB(default_rfind)(s, n, p, m, maxcount, mode); + return STRINGLIB(horspool_rfind)(s, n, p, m, maxcount, mode, 1); + // return STRINGLIB(two_way_rfind)(s, n, p, m, maxcount, mode); } } From 48e7dd5ed9147b306ed70e2632230a6bdb7e703c Mon Sep 17 00:00:00 2001 From: "d.grigonis" Date: Wed, 5 Jun 2024 02:46:08 +0300 Subject: [PATCH 03/28] bug --- Objects/stringlib/fastsearch.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 409b52ac4a87c1..be3a8ba0bcecee 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -1056,7 +1056,7 @@ STRINGLIB(horspool_rfind)(const STRINGLIB_CHAR* s, Py_ssize_t n, // LOG_LINEUP_REV(); if (bloom_gap) { if (s_first != p_first){ - if (!STRINGLIB_BLOOM(mask, s[i - 1])){ + if (i > 0 && !STRINGLIB_BLOOM(mask, s[i - 1])){ i -= m + 1; LOG("Bloom skip\n"); } @@ -1101,7 +1101,7 @@ STRINGLIB(horspool_rfind)(const STRINGLIB_CHAR* s, Py_ssize_t n, } i -= m; } - else if (bloom_gap && !STRINGLIB_BLOOM(mask, s[i - 1])) { + else if (bloom_gap && i > 0 && !STRINGLIB_BLOOM(mask, s[i - 1])) { LOG("move by (m + 1) = %ld\n", m + 1); i -= m + 1; } From d11f9b9e5aa553abd03baa481dd3f3f667c57a2b Mon Sep 17 00:00:00 2001 From: "d.grigonis" Date: Wed, 5 Jun 2024 22:08:28 +0300 Subject: [PATCH 04/28] bi-directional horspool_find --- Objects/stringlib/fastsearch.h | 784 
++++++++++++--------------------- 1 file changed, 284 insertions(+), 500 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index be3a8ba0bcecee..a0d9395e9404e7 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -210,8 +210,11 @@ STRINGLIB(rfind_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch) Py_LOCAL_INLINE(Py_ssize_t) -STRINGLIB(_lex_search)(const STRINGLIB_CHAR *needle, Py_ssize_t len_needle, - Py_ssize_t *return_period, int invert_alphabet) +STRINGLIB(_lex_search)(const STRINGLIB_CHAR *needle, + Py_ssize_t len_needle, + Py_ssize_t *return_period, + int invert_alphabet, + int dir) { /* Do a lexicographic search. Essentially this: >>> max(needle[i:] for i in range(len(needle)+1)) @@ -221,11 +224,12 @@ STRINGLIB(_lex_search)(const STRINGLIB_CHAR *needle, Py_ssize_t len_needle, Py_ssize_t k = 0; // The period of the right half. Py_ssize_t period = 1; - + Py_ssize_t stt = dir == 1 ? 0 : len_needle - 1; + STRINGLIB_CHAR a, b; while (candidate + k < len_needle) { // each loop increases candidate + k + max_suffix - STRINGLIB_CHAR a = needle[candidate + k]; - STRINGLIB_CHAR b = needle[max_suffix + k]; + a = needle[stt + dir*(candidate + k)]; + b = needle[stt + dir*(max_suffix + k)]; // check if the suffix at candidate is better than max_suffix if (invert_alphabet ? (b < a) : (a < b)) { // Fell short of max_suffix. @@ -262,63 +266,11 @@ STRINGLIB(_lex_search)(const STRINGLIB_CHAR *needle, Py_ssize_t len_needle, } -Py_LOCAL_INLINE(Py_ssize_t) -STRINGLIB(_lex_search_rev)(const STRINGLIB_CHAR *needle, Py_ssize_t len_needle, - Py_ssize_t *return_period, int invert_alphabet) -{ - /* Do a lexicographic search. Essentially this: - >>> max(needle[i:] for i in range(len(needle)+1)) - Also find the period of the right half. */ - Py_ssize_t max_suffix = 0; - Py_ssize_t candidate = 1; - Py_ssize_t k = 0; - // The period of the right half. 
- Py_ssize_t period = 1; - Py_ssize_t offset = len_needle - 1; - while (candidate + k < len_needle) { - // each loop increases candidate + k + max_suffix - STRINGLIB_CHAR a = needle[offset - candidate - k]; - STRINGLIB_CHAR b = needle[offset - max_suffix - k]; - // check if the suffix at candidate is better than max_suffix - if (invert_alphabet ? (b < a) : (a < b)) { - // Fell short of max_suffix. - // The next k + 1 characters are non-increasing - // from candidate, so they won't start a maximal suffix. - candidate += k + 1; - k = 0; - // We've ruled out any period smaller than what's - // been scanned since max_suffix. - period = candidate - max_suffix; - } - else if (a == b) { - if (k + 1 != period) { - // Keep scanning the equal strings - k++; - } - else { - // Matched a whole period. - // Start matching the next period. - candidate += period; - k = 0; - } - } - else { - // Did better than max_suffix, so replace it. - max_suffix = candidate; - candidate++; - k = 0; - period = 1; - } - } - *return_period = period; - return len_needle - max_suffix - 1; -} - - Py_LOCAL_INLINE(Py_ssize_t) STRINGLIB(_factorize)(const STRINGLIB_CHAR *needle, Py_ssize_t len_needle, - Py_ssize_t *return_period) + Py_ssize_t *return_period, + int dir) { /* Do a "critical factorization", making it so that: >>> needle = (left := needle[:cut]) + (right := needle[cut:]) @@ -350,11 +302,28 @@ STRINGLIB(_factorize)(const STRINGLIB_CHAR *needle, GC | AGAGAG AGAGAGC = AGAGAGC The length of this minimal repetition is 7, which is indeed the - period of the original string. */ - + period of the original string. + + This is how reverse direction compares to forward: + returned cut is "the size of the cut from the start point". 
E.g.: + >>> x = '1234' + >>> cut, period = factorize(x, 1) # cut = 0 + >>> cut + 0 + >>> cut_idx = cut + >>> x[:cut_idx], x[cut_idx:] + '', '1234' + >>> x = '4321' + >>> cut, period = factorize(x, -1) + >>> cut + 0 + >>> cut_idx = len(x) - cut + >>> x[:cut_idx], x[cut_idx:] + '4321', '' + */ Py_ssize_t cut1, period1, cut2, period2, cut, period; - cut1 = STRINGLIB(_lex_search)(needle, len_needle, &period1, 0); - cut2 = STRINGLIB(_lex_search)(needle, len_needle, &period2, 1); + cut1 = STRINGLIB(_lex_search)(needle, len_needle, &period1, 0, dir); + cut2 = STRINGLIB(_lex_search)(needle, len_needle, &period2, 1, dir); // Take the later cut. if (cut1 > cut2) { period = period1; @@ -364,40 +333,6 @@ STRINGLIB(_factorize)(const STRINGLIB_CHAR *needle, period = period2; cut = cut2; } - - LOG("split: "); LOG_STRING(needle, cut); - LOG(" + "); LOG_STRING(needle + cut, len_needle - cut); - LOG("\n"); - LOG("Cut: %ld & Period: %ld\n", cut, period); - - *return_period = period; - return cut; -} - - -Py_LOCAL_INLINE(Py_ssize_t) -STRINGLIB(_factorize_rev)(const STRINGLIB_CHAR *needle, - Py_ssize_t len_needle, - Py_ssize_t *return_period) -{ - Py_ssize_t cut1, period1, cut2, period2, cut, period; - cut1 = STRINGLIB(_lex_search_rev)(needle, len_needle, &period1, 0); - cut2 = STRINGLIB(_lex_search_rev)(needle, len_needle, &period2, 1); - // Take the later cut. 
- if (cut1 < cut2) { - period = period1; - cut = cut1; - } - else { - period = period2; - cut = cut2; - } - - LOG("split: "); LOG_STRING(needle, cut + 1); - LOG(" + "); LOG_STRING(needle + cut + 1, len_needle); - LOG("\n"); - LOG("Cut: %ld & Period: %ld\n", cut, period); - *return_period = period; return cut; } @@ -422,101 +357,78 @@ typedef struct STRINGLIB(_pre) { static void -STRINGLIB(_preprocess)(const STRINGLIB_CHAR *needle, - Py_ssize_t len_needle, - STRINGLIB(prework) *pw, - int direction, - int critical_fac, - int bc_table_gs_gap) +STRINGLIB(_init_bc_table_gs_gap)(STRINGLIB(prework) *pw, int dir) { - // Set the Needle & Calculate Critical Factorization - if (critical_fac) { - pw->needle = needle; - pw->len_needle = len_needle; - if (direction == 1) { - pw->cut = STRINGLIB(_factorize)(needle, len_needle, &(pw->period)); - assert(pw->period + pw->cut <= len_needle); - //bbb c - // | - //bbb - // bb c - // - //c bbb - // | - pw->is_periodic = (0 == memcmp(needle, - needle + pw->period, - pw->cut * STRINGLIB_SIZEOF_CHAR)); - if (pw->is_periodic) { - assert(pw->cut <= len_needle/2); - assert(pw->cut < pw->period); - } - else { - // A lower bound on the period - pw->period = Py_MAX(pw->cut, len_needle - pw->cut) + 1; - } - } - else { - // [0, 1, 2, 3] - pw->cut = STRINGLIB(_factorize_rev)( - needle, len_needle, &(pw->period)); - Py_ssize_t inv_cut = len_needle - pw->cut - 1; - assert(pw->cut - pw->period >= 0); - pw->is_periodic = (0 == memcmp(needle + len_needle - inv_cut, - needle + len_needle - inv_cut - pw->period, - inv_cut * STRINGLIB_SIZEOF_CHAR)); - if (pw->is_periodic) { - assert(pw->cut >= len_needle/2); - assert(len_needle - pw->cut - 1 < pw->period); - } - else { - // A lower bound on the period - pw->period = Py_MIN(pw->cut, len_needle + pw->cut) + 1; - } - } + // 1. Fill up a compressed Boyer-Moore "Bad Character" table + const STRINGLIB_CHAR *needle = pw->needle; + Py_ssize_t len_needle = pw->len_needle; + Py_ssize_t stt = dir == 1 ? 
0 : len_needle - 1; + Py_ssize_t end = dir == 1 ? len_needle - 1 : 0; + Py_ssize_t not_found_shift = Py_MIN(len_needle, MAX_SHIFT); + for (Py_ssize_t i = 0; i < (Py_ssize_t)TABLE_SIZE; i++) { + pw->table[i] = Py_SAFE_DOWNCAST(not_found_shift, + Py_ssize_t, SHIFT_TYPE); } - if (bc_table_gs_gap) { - // Fill up a compressed Boyer-Moore "Bad Character" table - Py_ssize_t not_found_shift = Py_MIN(len_needle, MAX_SHIFT); - for (Py_ssize_t i = 0; i < (Py_ssize_t)TABLE_SIZE; i++) { - pw->table[i] = Py_SAFE_DOWNCAST(not_found_shift, + for (Py_ssize_t i = len_needle - not_found_shift; i < len_needle; i++) { + SHIFT_TYPE shift = Py_SAFE_DOWNCAST(len_needle - 1 - i, Py_ssize_t, SHIFT_TYPE); + pw->table[needle[stt + dir*i] & TABLE_MASK] = shift; + } + // 2. Initialize "Good Suffix" Last Character Gap + // Note: gap("___aa") = 1 + pw->gap = len_needle; + STRINGLIB_CHAR last = needle[end] & TABLE_MASK; + for (Py_ssize_t i = 1; i < len_needle; i++) { + STRINGLIB_CHAR x = needle[end - dir*i] & TABLE_MASK; + if (x == last) { + pw->gap = i; + break; } - if (direction == 1) { - for (Py_ssize_t i = len_needle - not_found_shift; i < len_needle; i++) { - SHIFT_TYPE shift = Py_SAFE_DOWNCAST(len_needle - 1 - i, - Py_ssize_t, SHIFT_TYPE); - pw->table[needle[i] & TABLE_MASK] = shift; - } + } + LOG("Good Suffix Gap: %ld\n", pw->gap); +} - // Initialize "Good Suffix" Last Character Gap - // Note: gap("___aa") = 1 - pw->gap = len_needle; - STRINGLIB_CHAR last = needle[len_needle - 1] & TABLE_MASK; - for (Py_ssize_t i = len_needle - 2; i >= 0; i--) { - STRINGLIB_CHAR x = needle[i] & TABLE_MASK; - if (x == last) { - pw->gap = len_needle - 1 - i; - break; - } - } - } - else { - for (Py_ssize_t i = not_found_shift - 1; i >= 0; i--) { - SHIFT_TYPE shift = Py_SAFE_DOWNCAST(i, Py_ssize_t, SHIFT_TYPE); - pw->table[needle[i] & TABLE_MASK] = shift; - } - pw->gap = len_needle; - STRINGLIB_CHAR last = needle[0] & TABLE_MASK; - for (Py_ssize_t i = 1; i < len_needle; i++) { - STRINGLIB_CHAR x = needle[i] & 
TABLE_MASK; - if (x == last) { - pw->gap = i; - break; - } - } - } - LOG("Good Suffix Gap: %ld\n", pw->gap); + +static void +STRINGLIB(_init_critical_fac)(STRINGLIB(prework) *pw, int dir) +{ + // Calculate Critical Factorization + const STRINGLIB_CHAR *needle = pw->needle; + Py_ssize_t len_needle = pw->len_needle; + Py_ssize_t cut, period; + int is_periodic; + cut = STRINGLIB(_factorize)(needle, len_needle, &period, dir); + assert(cut + period <= len_needle); + if (dir == 1) { + is_periodic = (0 == memcmp(needle, + needle + period, + cut * STRINGLIB_SIZEOF_CHAR)); + } + else { + Py_ssize_t cut_idx = len_needle - cut; + is_periodic = (0 == memcmp(needle + cut_idx, + needle + cut_idx - period, + cut * STRINGLIB_SIZEOF_CHAR)); } + if (is_periodic) { + assert(cut <= len_needle/2); + assert(cut < period); + } + else { + // A lower bound on the period + // CLARIFY> An upper bound? + period = Py_MAX(cut, len_needle - cut) + 1; + } + pw->cut = cut; + pw->period = period; + pw->is_periodic = is_periodic; + + LOG("Cut: %ld & Period: %ld\n", cut, period); + LOG("split: "); + LOG_STRING(needle, cut); + LOG(" + "); + LOG_STRING(needle + cut, len_needle - cut); + LOG("\n"); } @@ -525,18 +437,24 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, Py_ssize_t maxcount, int mode, - STRINGLIB(prework) *pw) + STRINGLIB(prework) *pw)//, + // int reversed) { // Crochemore and Perrin's (1991) Two-Way algorithm. // See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260 + // Needle const Py_ssize_t len_needle = pw->len_needle; + const STRINGLIB_CHAR *const needle = pw->needle; + // Cut & Period const Py_ssize_t cut = pw->cut; + // const Py_ssize_t cut_idx = reversed ? 
len_needle - cut : cut; Py_ssize_t period = pw->period; - const STRINGLIB_CHAR *const needle = pw->needle; + // Windows const STRINGLIB_CHAR *window_last = haystack + len_needle - 1; const STRINGLIB_CHAR *const haystack_end = haystack + len_haystack; SHIFT_TYPE *table = pw->table; const STRINGLIB_CHAR *window; + // Log LOG("===== Two-way: \"%s\" in \"%s\". =====\n", needle, haystack); if (mode == FAST_COUNT){ LOG("###### Counting \"%s\" in \"%s\".\n", needle, haystack); @@ -544,6 +462,7 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, else { LOG("###### Finding \"%s\" in \"%s\".\n", needle, haystack); } + // Prepare Py_ssize_t count = 0; Py_ssize_t gap = pw->gap; Py_ssize_t shift, i; @@ -569,10 +488,10 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, LOG("Horspool skip\n"); continue; } + no_shift: if (window_last >= haystack_end){ break; } - no_shift: window = window_last - len_needle + 1; assert((window[len_needle - 1] & TABLE_MASK) == (needle[len_needle - 1] & TABLE_MASK)); @@ -602,36 +521,39 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, continue; } if (is_periodic) { - i = memory; - } else { - i = 0; - } - for (; i < cut; i++) { - ihits++; - if (needle[i] != window[i]) { - LOG("Left half does not match.\n"); - window_last += period; - if (!is_periodic){ - break; + for (i = memory; i < cut; i++) { + ihits++; + if (needle[i] != window[i]) { + LOG("Left half does not match.\n"); + window_last += period; + memory = len_needle - period; + if (window_last >= haystack_end) { + break; + } + iloop++; + Py_ssize_t shift = table[(*window_last) & TABLE_MASK]; + if (shift != 0){ + // A mismatch has been identified to the right + // of where i will next start, so we can jump + // at least as far as if the mismatch occurred + // on the first comparison. 
+ Py_ssize_t mem_jump = Py_MAX(cut, memory) - cut + 1; + LOG("Skip with Memory.\n"); + memory = 0; + window_last += Py_MAX(shift, mem_jump); + break; + } + goto no_shift; } - memory = len_needle - period; - if (window_last >= haystack_end) { + } + } else { + for (i = 0; i < cut; i++) { + ihits++; + if (needle[i] != window[i]) { + LOG("Left half does not match.\n"); + window_last += period; break; } - iloop++; - Py_ssize_t shift = table[(*window_last) & TABLE_MASK]; - if (!shift){ - goto no_shift; - } - // A mismatch has been identified to the right - // of where i will next start, so we can jump - // at least as far as if the mismatch occurred - // on the first comparison. - Py_ssize_t mem_jump = Py_MAX(cut, memory) - cut + 1; - LOG("Skip with Memory.\n"); - memory = 0; - window_last += Py_MAX(shift, mem_jump); - break; } } if (i == cut) { @@ -662,21 +584,6 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, } -static Py_ssize_t -STRINGLIB(two_way_find)(const STRINGLIB_CHAR *haystack, - Py_ssize_t len_haystack, - const STRINGLIB_CHAR *needle, - Py_ssize_t len_needle, - Py_ssize_t maxcount, - int mode) -{ - int dir = 1; - STRINGLIB(prework) pw; - STRINGLIB(_preprocess)(needle, len_needle, &pw, dir, 1, 1); - return STRINGLIB(_two_way)(haystack, len_haystack, maxcount, mode, &pw); -} - - static Py_ssize_t STRINGLIB(_two_way_rev)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, @@ -687,7 +594,10 @@ STRINGLIB(_two_way_rev)(const STRINGLIB_CHAR *haystack, // Reversed Crochemore and Perrin's (1991) Two-Way algorithm. 
// See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260 const Py_ssize_t len_needle = pw->len_needle; - const Py_ssize_t cut = pw->cut; + const Py_ssize_t cut = pw->cut - 1; + // TODO> + const Py_ssize_t left_len = len_needle - pw->cut; + // TODO> Py_ssize_t period = pw->period; const STRINGLIB_CHAR *const needle = pw->needle; const STRINGLIB_CHAR *const haystack_end = haystack + len_haystack; @@ -726,10 +636,10 @@ STRINGLIB(_two_way_rev)(const STRINGLIB_CHAR *haystack, LOG("Horspool skip\n"); continue; } + no_shift: if (window < haystack){ break; } - no_shift: assert((window[0] & TABLE_MASK) == (needle[0] & TABLE_MASK)); if (is_periodic) { i = Py_MIN(cut, len_needle - memory); @@ -756,41 +666,47 @@ STRINGLIB(_two_way_rev)(const STRINGLIB_CHAR *haystack, if (i != -1){ continue; } - i = len_needle - 1; + if (is_periodic) { - i -= memory; - } - for (; i > cut; i--) { - ihits++; - if (needle[i] != window[i]) { - LOG("Right half does not match.\n"); - window -= period; - if (!is_periodic){ - break; + for (i = len_needle - 1 - memory; i > cut; i--) { + ihits++; + if (needle[i] != window[i]) { + LOG("Right half does not match.\n"); + window -= period; + memory = len_needle - period; + if (window < haystack) { + break; + } + iloop++; + Py_ssize_t shift = table[(*window) & TABLE_MASK]; + if (shift != 0){ + // A mismatch has been identified to the right + // of where i will next start, so we can jump + // at least as far as if the mismatch occurred + // on the first comparison. 
+ // REF> + // 0 m + + // [0 1 2 3] + // m 0 + inv_cut = len_needle - cut - 1; + Py_ssize_t mem_jump = Py_MAX(inv_cut, memory) - inv_cut + 1; + LOG("Skip with Memory.\n"); + memory = 0; + window -= Py_MAX(shift, mem_jump); + break; + } + goto no_shift; } - memory = len_needle - period; - if (window < haystack) { + } + } + else { + for (i = len_needle - 1; i > cut; i--) { + ihits++; + if (needle[i] != window[i]) { + LOG("Right half does not match.\n"); + window -= period; break; } - iloop++; - Py_ssize_t shift = table[(*window) & TABLE_MASK]; - if (!shift){ - goto no_shift; - } - // A mismatch has been identified to the right - // of where i will next start, so we can jump - // at least as far as if the mismatch occurred - // on the first comparison. - // REF> - // 0 m + - // [0 1 2 3] - // m 0 - inv_cut = len_needle - cut - 1; - Py_ssize_t mem_jump = Py_MAX(inv_cut, memory) - inv_cut + 1; - LOG("Skip with Memory.\n"); - memory = 0; - window -= Py_MAX(shift, mem_jump); - break; } } if (i == cut) { @@ -822,125 +738,141 @@ STRINGLIB(_two_way_rev)(const STRINGLIB_CHAR *haystack, static Py_ssize_t -STRINGLIB(two_way_rfind)(const STRINGLIB_CHAR *haystack, - Py_ssize_t len_haystack, - const STRINGLIB_CHAR *needle, - Py_ssize_t len_needle, - Py_ssize_t maxcount, - int mode) +STRINGLIB(two_way_find)(const STRINGLIB_CHAR *haystack, + Py_ssize_t len_haystack, + const STRINGLIB_CHAR *needle, + Py_ssize_t len_needle, + Py_ssize_t maxcount, + int mode, int dir) { - int dir = -1; STRINGLIB(prework) pw; - STRINGLIB(_preprocess)(needle, len_needle, &pw, dir, 1, 1); - return STRINGLIB(_two_way_rev)(haystack, len_haystack, maxcount, mode, &pw); + (&pw)->needle = needle; + (&pw)->len_needle = len_needle; + STRINGLIB(_init_bc_table_gs_gap)(&pw, dir); + STRINGLIB(_init_critical_fac)(&pw, dir); + if (dir == 1) { + return STRINGLIB(_two_way)(haystack, len_haystack, maxcount, mode, &pw); + } + else { + return STRINGLIB(_two_way_rev)(haystack, len_haystack, maxcount, mode, &pw); + } + } 
static Py_ssize_t STRINGLIB(horspool_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, const STRINGLIB_CHAR* p, Py_ssize_t m, - Py_ssize_t maxcount, int mode, int dynamic) + Py_ssize_t maxcount, int mode, + int dir, int dynamic) { /* Boyer–Moore–Horspool algorithm with optional dynamic fallback to Two-Way algorithm */ - int dir = 1; - Py_ssize_t shift, i, j; - Py_ssize_t mlast = m - 1; - const Py_ssize_t width = n - m; - const STRINGLIB_CHAR p_last = p[mlast]; - const STRINGLIB_CHAR *ss = s + mlast; - STRINGLIB_CHAR s_last; - - // Pre-Work STRINGLIB(prework) pw; - STRINGLIB(_preprocess)(p, m, &pw, dir, 0, 1); - Py_ssize_t gap = pw.gap; - SHIFT_TYPE *table = pw.table; + (&pw)->needle = p; + (&pw)->len_needle = m; + STRINGLIB(_init_bc_table_gs_gap)(&pw, dir); + Py_ssize_t gap = (&pw)->gap; + SHIFT_TYPE *table = (&pw)->table; + + // Direction Agnostic Constants + const Py_ssize_t m_m1 = m - 1; + const Py_ssize_t m_p1 = m + 1; + const Py_ssize_t w = n - m; + + // Bi-Directional Constants + const Py_ssize_t s_stt = dir == 1 ? 0 : n - 1; + const Py_ssize_t p_stt = dir == 1 ? 0 : m - 1; + const Py_ssize_t p_end = dir == 1 ? 
m - 1 : 0; + const Py_ssize_t dir_m_m1 = dir * m_m1; + + // String Pointers + const STRINGLIB_CHAR *const p_last = p + p_end; + const STRINGLIB_CHAR *const s_plast = s + s_stt + dir_m_m1; + const STRINGLIB_CHAR *win, *win_last;; + + // Temporary Variables + Py_ssize_t shift, i, ip, j, j_stop, idx; // Use Bloom for len(needle) <= 64 // Initialization is costly for long needles // And this is not much beneficial for large set(needle) unsigned long mask = 0; - Py_ssize_t bloom_gap = 0; - Py_ssize_t j_stop = m; + Py_ssize_t true_gap = 0; if (m <= 64) { - LOG("Initializing Bloom\n"); - // Note: bloom_gap("___aa") = 1 - j_stop -= 1; - bloom_gap = m; - STRINGLIB_BLOOM_ADD(mask, p_last); - for (i = 0; i < mlast; i++) { - STRINGLIB_BLOOM_ADD(mask, p[i]); - if (p[i] == p_last) { - bloom_gap = mlast - i; + true_gap = m; + // Note: true_gap("___aa") = 1 + STRINGLIB_BLOOM_ADD(mask, *p_last); + for (j = 1; j < m; j++) { + ip = p_end - dir * j; + STRINGLIB_BLOOM_ADD(mask, p[ip]); + if (true_gap == m && p[ip] == *p_last) { + true_gap = j; } } + LOG("Good Suffix True Gap: %ld\n", true_gap); } // Horspool Calibration - const float hrs_lcost = 4.0f; // average loop cost - const float hrs_hcost = 0.4f; // false positive hit cost + const double hrs_lcost = 4.0; // average loop cost + const double hrs_hcost = 0.4; // false positive hit cost const int ih_min = 100; // minimum FP hits to fallback // Two-Way Calibration - const float twy_icost = 3.0f * m; // initialization cost - const float twy_lcost = 3.0f; // loop cost + const double twy_icost = 3.0 * m; // initialization cost + const double twy_lcost = 3.0; // loop cost /* Running variables */ - float loops_left, total_time_hrs; - float exp_hrs, exp_twy; + double total_time_hrs, exp_hrs, exp_twy; + Py_ssize_t loops_left; Py_ssize_t count = 0; Py_ssize_t iloop=0; Py_ssize_t ihits=0; - // TODO> synchronize vvariable names!!! 
- // const Py_ssize_t len_needle = m; - // const Py_ssize_t len_haystack = n; - // const STRINGLIB_CHAR *const needle = p; - // const STRINGLIB_CHAR *window_last; - // const STRINGLIB_CHAR *const haystack = s; - // const STRINGLIB_CHAR *const haystack_end = haystack + len_haystack; - for (i = 0; i <= width;) { + for (i = 0; i <= w;) { iloop++; - s_last = ss[i]; - // window_last = s_last; - // LOG_LINEUP(); - if (bloom_gap) { - if (s_last != p_last){ - if (!STRINGLIB_BLOOM(mask, ss[i+1])){ - i += m + 1; + win_last = s_plast + dir * i; + if (true_gap) { + if (*win_last != *p_last){ + if (i < w && !STRINGLIB_BLOOM(mask, win_last[dir])){ + i += m_p1; LOG("Bloom skip\n"); } else { - shift = table[s_last & TABLE_MASK]; - i += shift; - if (shift == 0){ + shift = table[*win_last & TABLE_MASK]; + if (shift != 0){ + i += shift; + LOG("Horspool skip\n"); + } + else { i += 1; } - LOG("Modified Horspool skip\n"); } continue; } - assert(s_last == p_last); + assert(*win_last == *p_last); + j_stop = m_m1; } else { - shift = table[s_last & TABLE_MASK]; - i += shift; + shift = table[*win_last & TABLE_MASK]; if (shift != 0){ + i += shift; LOG("Horspool skip\n"); continue; } - assert((ss[i] & TABLE_MASK) == (p_last & TABLE_MASK)); - } - if (i > width){ - break; + assert((*win_last & TABLE_MASK) == (*p_last & TABLE_MASK)); + j_stop = m; } - for (j = 0; j < j_stop; j++) { + win = win_last - p_end; + j = 0; + for (; j < j_stop; j++) { ihits++; - if (s[i + j] != p[j]) { + ip = p_stt + dir * j; + if (win[ip] != p[ip]) { break; } } if (j == j_stop) { - LOG("Found a match at %ld!\n", i); + idx = dir == 1 ? 
i : n - m - i; + LOG("Found a match at %ld!\n", idx); if (mode != FAST_COUNT) { - return i; + return idx; } count++; if (count == maxcount) { @@ -948,184 +880,36 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, } i += m; } - else if (bloom_gap && !STRINGLIB_BLOOM(mask, ss[i+1])) { - LOG("move by (m + 1) = %ld\n", m + 1); - i += m + 1; + else if (true_gap && i < w && !STRINGLIB_BLOOM(mask, win_last[dir])) { + LOG("move by (m + 1) = %ld\n", m_p1); + i += m_p1; } else { - if (bloom_gap) { + if (true_gap) { LOG("move by bloom gap = %ld\n", gap); - i += bloom_gap; + i += true_gap; } else { LOG("move by gap = %ld\n", gap); i += gap; } } if (dynamic && ihits > ih_min) { - loops_left = width - i + 1; + loops_left = w - i + 1; total_time_hrs = iloop * hrs_lcost + ihits * hrs_hcost; exp_hrs = total_time_hrs / i * loops_left; exp_twy = twy_icost + loops_left * twy_lcost; if (exp_twy < exp_hrs) { - STRINGLIB(_preprocess)(p, m, &pw, dir, 1, 0); - Py_ssize_t res = STRINGLIB(_two_way)( - s + i, n - i, maxcount - count, mode, &pw); - if (mode == FAST_SEARCH) { - return res == -1 ? -1 : res + i; - } - else { - return res + count; - } - } - } - } - // Loop Counter and False Hit Counter Logging - // In worst case scenario iloop > n - m. - // Used for calibration and fallback decision - LOG("iloop: %ld\n", iloop); - LOG("ihits: %ld\n", ihits); - return mode == FAST_COUNT ? 
count : -1; -} - - -static Py_ssize_t -STRINGLIB(horspool_rfind)(const STRINGLIB_CHAR* s, Py_ssize_t n, - const STRINGLIB_CHAR* p, Py_ssize_t m, - Py_ssize_t maxcount, int mode, int dynamic) -{ - /* Reverse Boyer–Moore–Horspool algorithm - with optional dynamic fallback to Two-Way algorithm */ - int dir = -1; - Py_ssize_t shift, i, j; - Py_ssize_t mlast = m - 1; - const Py_ssize_t width = n - m; - const STRINGLIB_CHAR p_first = p[0]; - STRINGLIB_CHAR s_first; - - // Pre-Work - STRINGLIB(prework) pw; - STRINGLIB(_preprocess)(p, m, &pw, dir, 0, 1); - Py_ssize_t gap = pw.gap; - SHIFT_TYPE *table = pw.table; - - // Use Bloom for len(needle) <= 64 - // Initialization is costly for long needles - // And this is not much beneficial for large set(needle) - unsigned long mask = 0; - Py_ssize_t bloom_gap = 0; - Py_ssize_t j_stop = -1; - if (m <= 64) { - LOG("Initializing Bloom\n"); - // Note: bloom_gap("___aa") = 1 - j_stop += 1; - bloom_gap = m; - STRINGLIB_BLOOM_ADD(mask, p_first); - for (i = mlast; i > 0; i--) { - STRINGLIB_BLOOM_ADD(mask, p[i]); - if (p[i] == p_first) { - bloom_gap = i; - } - } - - } - // Horspool Calibration - const float hrs_lcost = 4.0f; // average loop cost - const float hrs_hcost = 0.4f; // false positive hit cost - const int ih_min = 100; // minimum FP hits to fallback - // Two-Way Calibration - const float twy_icost = 3.0f * m; // initialization cost - const float twy_lcost = 3.0f; // loop cost - /* Running variables */ - float loops_left, loops_past, total_time_hrs; - float exp_hrs, exp_twy; - Py_ssize_t count = 0; - Py_ssize_t iloop=0; - Py_ssize_t ihits=0; - // TODO> - // const Py_ssize_t len_needle = m; - // const Py_ssize_t len_haystack = n; - // const STRINGLIB_CHAR *const needle = p; - // const STRINGLIB_CHAR *window_last = s + len_needle - 1; - // const STRINGLIB_CHAR *const haystack = s; - // const STRINGLIB_CHAR *const haystack_end = haystack + len_haystack; - // const STRINGLIB_CHAR *window = haystack_end - len_needle; - for (i = width; 
i >= 0;) { - iloop++; - s_first = s[i]; - // window = s_first; - // LOG_LINEUP_REV(); - if (bloom_gap) { - if (s_first != p_first){ - if (i > 0 && !STRINGLIB_BLOOM(mask, s[i - 1])){ - i -= m + 1; - LOG("Bloom skip\n"); - } - else { - shift = table[s_first & TABLE_MASK]; - i -= shift; - if (shift == 0){ - i -= 1; - } - LOG("Modified Horspool skip\n"); + STRINGLIB(_init_critical_fac)(&pw, dir); + Py_ssize_t res; + if (dir == 1) { + res = STRINGLIB(_two_way)( + s + i, n - i, maxcount - count, mode, &pw); + } else { + res = STRINGLIB(_two_way_rev)( + s, n - i, maxcount - count, mode, &pw); } - continue; - } - assert(s_first == p_first); - } - else { - shift = table[s_first & TABLE_MASK]; - i -= shift; - if (shift != 0){ - LOG("Horspool skip\n"); - continue; - } - assert((s[i] & TABLE_MASK) == (p_first & TABLE_MASK)); - } - if (i < 0){ - break; - } - for (j = mlast; j > j_stop; j--) { - ihits++; - if (s[i + j] != p[j]) { - break; - } - } - if (j == j_stop) { - LOG("Found a match at %ld!\n", i); - if (mode != FAST_COUNT) { - return i; - } - count++; - if (count == maxcount) { - return maxcount; - } - i -= m; - } - else if (bloom_gap && i > 0 && !STRINGLIB_BLOOM(mask, s[i - 1])) { - LOG("move by (m + 1) = %ld\n", m + 1); - i -= m + 1; - } - else { - if (bloom_gap) { - LOG("move by bloom gap = %ld\n", gap); - i -= bloom_gap; - } else { - LOG("move by gap = %ld\n", gap); - i -= gap; - } - } - if (dynamic && ihits > ih_min) { - loops_past = width - i; - loops_left = i + 1; - total_time_hrs = iloop * hrs_lcost + ihits * hrs_hcost; - exp_hrs = total_time_hrs / loops_past * loops_left; - exp_twy = twy_icost + loops_left * twy_lcost; - if (exp_twy < exp_hrs) { - STRINGLIB(_preprocess)(p, m, &pw, 1, 0, dir); - Py_ssize_t res = STRINGLIB(_two_way_rev)( - s, n - loops_past, maxcount - count, mode, &pw); if (mode == FAST_SEARCH) { - return res == -1 ? -1 : res; + return res == -1 ? -1 : (dir == 1 ? 
res + i : res); } else { return res + count; @@ -1134,7 +918,7 @@ STRINGLIB(horspool_rfind)(const STRINGLIB_CHAR* s, Py_ssize_t n, } } // Loop Counter and False Hit Counter Logging - // In worst case scenario iloop > n - m. + // In worst case scenario: ihits == (n - m) * m // Used for calibration and fallback decision LOG("iloop: %ld\n", iloop); LOG("ihits: %ld\n", ihits); @@ -1262,7 +1046,7 @@ STRINGLIB(adaptive_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, hits += j + 1; if (hits > m / 4 && w - i > 2000) { res = STRINGLIB(two_way_find)( - s + i, n - i, p, m, maxcount, mode); + s + i, n - i, p, m, maxcount, mode, 1); if (mode == FAST_SEARCH) { return res == -1 ? -1 : res + i; } @@ -1388,12 +1172,12 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, } } if (mode != FAST_RSEARCH) { - return STRINGLIB(horspool_find)(s, n, p, m, maxcount, mode, 1); + return STRINGLIB(horspool_find)(s, n, p, m, maxcount, mode, 1, 1); // return STRINGLIB(two_way_find)(s, n, p, m, maxcount, mode); } else { /* FAST_RSEARCH */ - return STRINGLIB(horspool_rfind)(s, n, p, m, maxcount, mode, 1); + return STRINGLIB(horspool_find)(s, n, p, m, maxcount, mode, -1, 1); // return STRINGLIB(two_way_rfind)(s, n, p, m, maxcount, mode); } } From 6c9dbc3206f613808b10088b66577282604cea24 Mon Sep 17 00:00:00 2001 From: "d.grigonis" Date: Thu, 6 Jun 2024 03:26:18 +0300 Subject: [PATCH 05/28] optimized horspool --- Objects/stringlib/fastsearch.h | 171 +++++++++++++++++---------------- 1 file changed, 89 insertions(+), 82 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index a0d9395e9404e7..9ec7e6c47d792f 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -764,10 +764,19 @@ static Py_ssize_t STRINGLIB(horspool_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, const STRINGLIB_CHAR* p, Py_ssize_t m, Py_ssize_t maxcount, int mode, - int dir, int dynamic) + int direction, int dynamic) { /* Boyer–Moore–Horspool algorithm with optional dynamic 
fallback to Two-Way algorithm */ + if (mode == FAST_COUNT){ + LOG("Horspool Counting \"%s\" in \"%s\".\n", p, s); + } + else { + LOG("Horspool Finding \"%s\" in \"%s\".\n", p, s); + } + int dir = direction < 0 ? -1 : 1; + int reversed = dir < 0; + STRINGLIB(prework) pw; (&pw)->needle = p; (&pw)->len_needle = m; @@ -775,146 +784,143 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, Py_ssize_t gap = (&pw)->gap; SHIFT_TYPE *table = (&pw)->table; - // Direction Agnostic Constants + // Direction Agnostic const Py_ssize_t m_m1 = m - 1; const Py_ssize_t m_p1 = m + 1; const Py_ssize_t w = n - m; - // Bi-Directional Constants + // Direction Sensitive const Py_ssize_t s_stt = dir == 1 ? 0 : n - 1; const Py_ssize_t p_stt = dir == 1 ? 0 : m - 1; const Py_ssize_t p_end = dir == 1 ? m - 1 : 0; - const Py_ssize_t dir_m_m1 = dir * m_m1; - - // String Pointers - const STRINGLIB_CHAR *const p_last = p + p_end; - const STRINGLIB_CHAR *const s_plast = s + s_stt + dir_m_m1; - const STRINGLIB_CHAR *win, *win_last;; + const Py_ssize_t dir_m_m1 = reversed ? -m_m1 : m_m1; + const STRINGLIB_CHAR *const ss = s + s_stt + dir_m_m1; + const STRINGLIB_CHAR p_last = p[p_end]; // Temporary Variables - Py_ssize_t shift, i, ip, j, j_stop, idx; + Py_ssize_t shift, i, ip, jp, j, j_off; + STRINGLIB_CHAR s_last; - // Use Bloom for len(needle) <= 64 - // Initialization is costly for long needles - // And this is not much beneficial for large set(needle) + // Use Bloom for len(haystack) >= 10 * len(needle) unsigned long mask = 0; Py_ssize_t true_gap = 0; - if (m <= 64) { + Py_ssize_t j_stop = m; + if (n >= 10 * m) { + j_stop = m_m1; true_gap = m; // Note: true_gap("___aa") = 1 - STRINGLIB_BLOOM_ADD(mask, *p_last); + STRINGLIB_CHAR p_tmp; + STRINGLIB_BLOOM_ADD(mask, p_last); for (j = 1; j < m; j++) { - ip = p_end - dir * j; - STRINGLIB_BLOOM_ADD(mask, p[ip]); - if (true_gap == m && p[ip] == *p_last) { + jp = p_end + (reversed ? 
j : -j); + p_tmp = p[jp]; + STRINGLIB_BLOOM_ADD(mask, p_tmp); + if (true_gap == m && p_tmp == p_last) { true_gap = j; } } LOG("Good Suffix True Gap: %ld\n", true_gap); } + + // Total cost of two-way initialization // Horspool Calibration - const double hrs_lcost = 4.0; // average loop cost - const double hrs_hcost = 0.4; // false positive hit cost - const int ih_min = 100; // minimum FP hits to fallback + const double hrs_lcost = 4.0; // average loop cost + const double hrs_hcost = 0.4; // false positive hit cost // Two-Way Calibration - const double twy_icost = 3.0 * m; // initialization cost - const double twy_lcost = 3.0; // loop cost - /* Running variables */ - double total_time_hrs, exp_hrs, exp_twy; - Py_ssize_t loops_left; + const double twy_icost = 3.0 * (double)m; // total initialization cost + const double twy_lcost = 3.0; // loop cost + // Running variables + double exp_hrs, exp_twy, ll; // expected run times & loops left Py_ssize_t count = 0; - Py_ssize_t iloop=0; - Py_ssize_t ihits=0; + Py_ssize_t iloop = 0, ihits_last = 0; + Py_ssize_t ihits = 0, iloop_last = 0; for (i = 0; i <= w;) { iloop++; - win_last = s_plast + dir * i; + ip = reversed ? 
-i : i; + s_last = ss[ip]; + LOG("s_last: %c\n", s_last); if (true_gap) { - if (*win_last != *p_last){ - if (i < w && !STRINGLIB_BLOOM(mask, win_last[dir])){ - i += m_p1; - LOG("Bloom skip\n"); + shift = 0; + if (s_last != p_last) { + if (i < w && !STRINGLIB_BLOOM(mask, ss[ip+dir])){ + shift = m_p1; } else { - shift = table[*win_last & TABLE_MASK]; - if (shift != 0){ - i += shift; - LOG("Horspool skip\n"); - } - else { - i += 1; - } + shift = Py_MAX(table[s_last & TABLE_MASK], 1); } - continue; } - assert(*win_last == *p_last); - j_stop = m_m1; + assert(s_last == p_last); } else { - shift = table[*win_last & TABLE_MASK]; - if (shift != 0){ - i += shift; - LOG("Horspool skip\n"); - continue; - } - assert((*win_last & TABLE_MASK) == (*p_last & TABLE_MASK)); - j_stop = m; + shift = table[s_last & TABLE_MASK]; + assert((s_last & TABLE_MASK) == (p_last & TABLE_MASK)); + } + if (shift != 0) { + LOG("Shift %ld\n", shift); + i += shift; + continue; } - win = win_last - p_end; - j = 0; - for (; j < j_stop; j++) { + j_off = ip - p_end; + for (j = 0; j < j_stop; j++) { ihits++; - ip = p_stt + dir * j; - if (win[ip] != p[ip]) { + LOG("a: %c\n", ss[j_off + jp]); + LOG("b: %c\n", p[jp]); + jp = p_stt + (reversed ? -j : j); + if (ss[j_off + jp] != p[jp]) { break; } } if (j == j_stop) { - idx = dir == 1 ? i : n - m - i; - LOG("Found a match at %ld!\n", idx); + LOG("Found a match!\n"); if (mode != FAST_COUNT) { - return idx; + return reversed ? 
n - m - i : i; } - count++; - if (count == maxcount) { + if (++count == maxcount) { return maxcount; } i += m; } - else if (true_gap && i < w && !STRINGLIB_BLOOM(mask, win_last[dir])) { - LOG("move by (m + 1) = %ld\n", m_p1); - i += m_p1; - } - else { - if (true_gap) { - LOG("move by bloom gap = %ld\n", gap); + else if (true_gap) { + if (i < w && !STRINGLIB_BLOOM(mask, ss[ip+dir])) { + LOG("Move by (m + 1) = %ld\n", m_p1); + i += m_p1; + } + else { + LOG("Move by true gap = %ld\n", gap); i += true_gap; - } else { - LOG("move by gap = %ld\n", gap); - i += gap; } } - if (dynamic && ihits > ih_min) { - loops_left = w - i + 1; - total_time_hrs = iloop * hrs_lcost + ihits * hrs_hcost; - exp_hrs = total_time_hrs / i * loops_left; - exp_twy = twy_icost + loops_left * twy_lcost; + else { + LOG("Move by table gap = %ld\n", gap); + i += gap; + } + if (dynamic) { + if (ihits - ihits_last < 100 && iloop - iloop_last < 100) { + continue; + } + ll = (double)(w - i + 1); + exp_hrs = ((double)iloop * hrs_lcost + + (double)ihits * hrs_hcost) / (double)i * ll; + exp_twy = twy_icost + ll * twy_lcost; if (exp_twy < exp_hrs) { STRINGLIB(_init_critical_fac)(&pw, dir); Py_ssize_t res; - if (dir == 1) { - res = STRINGLIB(_two_way)( - s + i, n - i, maxcount - count, mode, &pw); - } else { + if (reversed) { res = STRINGLIB(_two_way_rev)( s, n - i, maxcount - count, mode, &pw); + } else { + res = STRINGLIB(_two_way)( + s + i, n - i, maxcount - count, mode, &pw); } if (mode == FAST_SEARCH) { - return res == -1 ? -1 : (dir == 1 ? res + i : res); + return res == -1 ? -1 : (reversed ? 
res : res + i); } else { return res + count; } } + ihits_last = ihits; + iloop_last = iloop; } } // Loop Counter and False Hit Counter Logging @@ -1172,6 +1178,7 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, } } if (mode != FAST_RSEARCH) { + // return STRINGLIB(horspool_find_old)(s, n, p, m, maxcount, mode, 0); return STRINGLIB(horspool_find)(s, n, p, m, maxcount, mode, 1, 1); // return STRINGLIB(two_way_find)(s, n, p, m, maxcount, mode); } From 1b9bdc9c54c4371e8ee26189827bac87a37ebd52 Mon Sep 17 00:00:00 2001 From: "d.grigonis" Date: Thu, 6 Jun 2024 04:59:19 +0300 Subject: [PATCH 06/28] more conservative bloom --- Objects/stringlib/fastsearch.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 9ec7e6c47d792f..839d82280b8ad9 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -805,7 +805,7 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, unsigned long mask = 0; Py_ssize_t true_gap = 0; Py_ssize_t j_stop = m; - if (n >= 10 * m) { + if (n >= 30 * m) { j_stop = m_m1; true_gap = m; // Note: true_gap("___aa") = 1 @@ -863,9 +863,8 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, j_off = ip - p_end; for (j = 0; j < j_stop; j++) { ihits++; - LOG("a: %c\n", ss[j_off + jp]); - LOG("b: %c\n", p[jp]); jp = p_stt + (reversed ? 
-j : j); + LOG("Checking %c %c ?\n", ss[j_off + jp], p[jp]); if (ss[j_off + jp] != p[jp]) { break; } @@ -903,6 +902,7 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, (double)ihits * hrs_hcost) / (double)i * ll; exp_twy = twy_icost + ll * twy_lcost; if (exp_twy < exp_hrs) { + LOG("switching to two-way algorithm: n=%ld, m=%ld\n", n, m); STRINGLIB(_init_critical_fac)(&pw, dir); Py_ssize_t res; if (reversed) { From 982b5109d75c3a22009cf8fd3a83c93cf4672a96 Mon Sep 17 00:00:00 2001 From: "d.grigonis" Date: Thu, 6 Jun 2024 05:33:23 +0300 Subject: [PATCH 07/28] fix assertions --- Objects/stringlib/fastsearch.h | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 839d82280b8ad9..1c14eb5892ef46 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -442,9 +442,14 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, { // Crochemore and Perrin's (1991) Two-Way algorithm. // See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260 - // Needle const Py_ssize_t len_needle = pw->len_needle; const STRINGLIB_CHAR *const needle = pw->needle; + if (mode == FAST_COUNT){ + LOG("Two-way Counting \"%s\" in \"%s\".\n", needle, haystack); + } + else { + LOG("Two-way Finding \"%s\" in \"%s\".\n", needle, haystack); + } // Cut & Period const Py_ssize_t cut = pw->cut; // const Py_ssize_t cut_idx = reversed ? len_needle - cut : cut; @@ -455,13 +460,6 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, SHIFT_TYPE *table = pw->table; const STRINGLIB_CHAR *window; // Log - LOG("===== Two-way: \"%s\" in \"%s\". 
=====\n", needle, haystack); - if (mode == FAST_COUNT){ - LOG("###### Counting \"%s\" in \"%s\".\n", needle, haystack); - } - else { - LOG("###### Finding \"%s\" in \"%s\".\n", needle, haystack); - } // Prepare Py_ssize_t count = 0; Py_ssize_t gap = pw->gap; @@ -743,8 +741,9 @@ STRINGLIB(two_way_find)(const STRINGLIB_CHAR *haystack, const STRINGLIB_CHAR *needle, Py_ssize_t len_needle, Py_ssize_t maxcount, - int mode, int dir) + int mode, int direction) { + int dir = direction < 0 ? -1 : 1; STRINGLIB(prework) pw; (&pw)->needle = needle; (&pw)->len_needle = len_needle; @@ -849,17 +848,17 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, shift = Py_MAX(table[s_last & TABLE_MASK], 1); } } - assert(s_last == p_last); } else { shift = table[s_last & TABLE_MASK]; - assert((s_last & TABLE_MASK) == (p_last & TABLE_MASK)); } if (shift != 0) { LOG("Shift %ld\n", shift); i += shift; continue; } + // assert(s_last == p_last); // true_gap + // assert((s_last & TABLE_MASK) == (p_last & TABLE_MASK)); // else j_off = ip - p_end; for (j = 0; j < j_stop; j++) { ihits++; From 4e9d278d7e9b6bf715294e5f51d2f17041b36ca8 Mon Sep 17 00:00:00 2001 From: "d.grigonis" Date: Thu, 6 Jun 2024 09:58:56 +0300 Subject: [PATCH 08/28] seamless reverse integration --- Objects/stringlib/fastsearch.h | 508 +++++++++++---------------------- 1 file changed, 161 insertions(+), 347 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 1c14eb5892ef46..fa0c271f5b3f6c 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -187,31 +187,26 @@ STRINGLIB(rfind_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch) # define LOG(...) printf(__VA_ARGS__) # define LOG_STRING(s, n) # define LOG_LINEUP() -# define LOG_LINEUP_REV() #elif LOG_LEVEL == 2 && STRINGLIB_SIZEOF_CHAR == 1 # define LOG(...) 
printf(__VA_ARGS__) # define LOG_STRING(s, n) printf("\"%.*s\"", (int)(n), s) -# define LOG_LINEUP() do { \ - LOG("> "); LOG_STRING(haystack, len_haystack); LOG("\n> "); \ - LOG("%*s",(int)(window_last - haystack + 1 - len_needle), ""); \ - LOG_STRING(needle, len_needle); LOG("\n"); \ -} while(0) -# define LOG_LINEUP_REV() do { \ - LOG("> "); LOG_STRING(haystack, len_haystack); LOG("\n> "); \ - LOG("%*s",(int)(window - haystack), ""); \ - LOG_STRING(needle, len_needle); LOG("\n"); \ +# define LOG_LINEUP() do { \ + if (n < 100) { \ + LOG("> "); LOG_STRING(s, n); \ + LOG("\n> "); LOG("%*s",(int)(ss - s + ip - p_end), ""); \ + LOG_STRING(p, m); LOG("\n"); \ + } \ } while(0) #else # define LOG(...) # define LOG_STRING(s, n) # define LOG_LINEUP() -# define LOG_LINEUP_REV() #endif Py_LOCAL_INLINE(Py_ssize_t) -STRINGLIB(_lex_search)(const STRINGLIB_CHAR *needle, - Py_ssize_t len_needle, +STRINGLIB(_lex_search)(const STRINGLIB_CHAR *p, + Py_ssize_t m, Py_ssize_t *return_period, int invert_alphabet, int dir) @@ -224,12 +219,12 @@ STRINGLIB(_lex_search)(const STRINGLIB_CHAR *needle, Py_ssize_t k = 0; // The period of the right half. Py_ssize_t period = 1; - Py_ssize_t stt = dir == 1 ? 0 : len_needle - 1; + Py_ssize_t stt = dir == 1 ? 0 : m - 1; STRINGLIB_CHAR a, b; - while (candidate + k < len_needle) { + while (candidate + k < m) { // each loop increases candidate + k + max_suffix - a = needle[stt + dir*(candidate + k)]; - b = needle[stt + dir*(max_suffix + k)]; + a = p[stt + dir*(candidate + k)]; + b = p[stt + dir*(max_suffix + k)]; // check if the suffix at candidate is better than max_suffix if (invert_alphabet ? (b < a) : (a < b)) { // Fell short of max_suffix. 
@@ -267,8 +262,8 @@ STRINGLIB(_lex_search)(const STRINGLIB_CHAR *needle, Py_LOCAL_INLINE(Py_ssize_t) -STRINGLIB(_factorize)(const STRINGLIB_CHAR *needle, - Py_ssize_t len_needle, +STRINGLIB(_factorize)(const STRINGLIB_CHAR *p, + Py_ssize_t m, Py_ssize_t *return_period, int dir) { @@ -322,8 +317,8 @@ STRINGLIB(_factorize)(const STRINGLIB_CHAR *needle, '4321', '' */ Py_ssize_t cut1, period1, cut2, period2, cut, period; - cut1 = STRINGLIB(_lex_search)(needle, len_needle, &period1, 0, dir); - cut2 = STRINGLIB(_lex_search)(needle, len_needle, &period2, 1, dir); + cut1 = STRINGLIB(_lex_search)(p, m, &period1, 0, dir); + cut2 = STRINGLIB(_lex_search)(p, m, &period2, 1, dir); // Take the later cut. if (cut1 > cut2) { period = period1; @@ -346,8 +341,8 @@ STRINGLIB(_factorize)(const STRINGLIB_CHAR *needle, #define TABLE_MASK (TABLE_SIZE - 1U) typedef struct STRINGLIB(_pre) { - const STRINGLIB_CHAR *needle; - Py_ssize_t len_needle; + const STRINGLIB_CHAR *p; + Py_ssize_t m; Py_ssize_t cut; Py_ssize_t period; Py_ssize_t gap; @@ -360,26 +355,26 @@ static void STRINGLIB(_init_bc_table_gs_gap)(STRINGLIB(prework) *pw, int dir) { // 1. Fill up a compressed Boyer-Moore "Bad Character" table - const STRINGLIB_CHAR *needle = pw->needle; - Py_ssize_t len_needle = pw->len_needle; - Py_ssize_t stt = dir == 1 ? 0 : len_needle - 1; - Py_ssize_t end = dir == 1 ? len_needle - 1 : 0; - Py_ssize_t not_found_shift = Py_MIN(len_needle, MAX_SHIFT); + const STRINGLIB_CHAR *p = pw->p; + Py_ssize_t m = pw->m; + Py_ssize_t stt = dir == 1 ? 0 : m - 1; + Py_ssize_t end = dir == 1 ? 
m - 1 : 0; + Py_ssize_t not_found_shift = Py_MIN(m, MAX_SHIFT); for (Py_ssize_t i = 0; i < (Py_ssize_t)TABLE_SIZE; i++) { pw->table[i] = Py_SAFE_DOWNCAST(not_found_shift, Py_ssize_t, SHIFT_TYPE); } - for (Py_ssize_t i = len_needle - not_found_shift; i < len_needle; i++) { - SHIFT_TYPE shift = Py_SAFE_DOWNCAST(len_needle - 1 - i, + for (Py_ssize_t i = m - not_found_shift; i < m; i++) { + SHIFT_TYPE shift = Py_SAFE_DOWNCAST(m - 1 - i, Py_ssize_t, SHIFT_TYPE); - pw->table[needle[stt + dir*i] & TABLE_MASK] = shift; + pw->table[p[stt + dir*i] & TABLE_MASK] = shift; } // 2. Initialize "Good Suffix" Last Character Gap // Note: gap("___aa") = 1 - pw->gap = len_needle; - STRINGLIB_CHAR last = needle[end] & TABLE_MASK; - for (Py_ssize_t i = 1; i < len_needle; i++) { - STRINGLIB_CHAR x = needle[end - dir*i] & TABLE_MASK; + pw->gap = m; + STRINGLIB_CHAR last = p[end] & TABLE_MASK; + for (Py_ssize_t i = 1; i < m; i++) { + STRINGLIB_CHAR x = p[end - dir*i] & TABLE_MASK; if (x == last) { pw->gap = i; break; @@ -393,31 +388,30 @@ static void STRINGLIB(_init_critical_fac)(STRINGLIB(prework) *pw, int dir) { // Calculate Critical Factorization - const STRINGLIB_CHAR *needle = pw->needle; - Py_ssize_t len_needle = pw->len_needle; + const STRINGLIB_CHAR *p = pw->p; + Py_ssize_t m = pw->m; Py_ssize_t cut, period; int is_periodic; - cut = STRINGLIB(_factorize)(needle, len_needle, &period, dir); - assert(cut + period <= len_needle); + cut = STRINGLIB(_factorize)(p, m, &period, dir); + assert(cut + period <= m); if (dir == 1) { - is_periodic = (0 == memcmp(needle, - needle + period, - cut * STRINGLIB_SIZEOF_CHAR)); + is_periodic = 0 == memcmp(p, p + period, cut * STRINGLIB_SIZEOF_CHAR); } else { - Py_ssize_t cut_idx = len_needle - cut; - is_periodic = (0 == memcmp(needle + cut_idx, - needle + cut_idx - period, + Py_ssize_t cut_idx = m - cut; + is_periodic = (0 == memcmp(p + cut_idx, p + cut_idx - period, cut * STRINGLIB_SIZEOF_CHAR)); } if (is_periodic) { - assert(cut <= len_needle/2); + 
assert(cut <= m/2); assert(cut < period); + LOG("Needle is periodic.\n"); } else { // A lower bound on the period // CLARIFY> An upper bound? - period = Py_MAX(cut, len_needle - cut) + 1; + period = Py_MAX(cut, m - cut) + 1; + LOG("Needle is not periodic.\n"); } pw->cut = cut; pw->period = period; @@ -425,337 +419,158 @@ STRINGLIB(_init_critical_fac)(STRINGLIB(prework) *pw, int dir) LOG("Cut: %ld & Period: %ld\n", cut, period); LOG("split: "); - LOG_STRING(needle, cut); + LOG_STRING(p, cut); LOG(" + "); - LOG_STRING(needle + cut, len_needle - cut); + LOG_STRING(p + cut, m - cut); LOG("\n"); } static Py_ssize_t -STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, - Py_ssize_t len_haystack, - Py_ssize_t maxcount, - int mode, - STRINGLIB(prework) *pw)//, - // int reversed) +STRINGLIB(_two_way)(const STRINGLIB_CHAR *s, Py_ssize_t n, + Py_ssize_t maxcount, int mode, + STRINGLIB(prework) *pw, int direction) { // Crochemore and Perrin's (1991) Two-Way algorithm. // See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260 - const Py_ssize_t len_needle = pw->len_needle; - const STRINGLIB_CHAR *const needle = pw->needle; - if (mode == FAST_COUNT){ - LOG("Two-way Counting \"%s\" in \"%s\".\n", needle, haystack); + if (mode == FAST_COUNT) { + LOG("Two-way Counting \"%s\" in \"%s\".\n", pw->p, s); } else { - LOG("Two-way Finding \"%s\" in \"%s\".\n", needle, haystack); + LOG("Two-way Finding \"%s\" in \"%s\".\n", pw->p, s); } - // Cut & Period - const Py_ssize_t cut = pw->cut; - // const Py_ssize_t cut_idx = reversed ? len_needle - cut : cut; - Py_ssize_t period = pw->period; - // Windows - const STRINGLIB_CHAR *window_last = haystack + len_needle - 1; - const STRINGLIB_CHAR *const haystack_end = haystack + len_haystack; - SHIFT_TYPE *table = pw->table; - const STRINGLIB_CHAR *window; - // Log + + int dir = direction < 0 ? 
-1 : 1; + int reversed = dir < 0; + // Prepare - Py_ssize_t count = 0; - Py_ssize_t gap = pw->gap; - Py_ssize_t shift, i; + const STRINGLIB_CHAR *const p = pw->p; + const Py_ssize_t m = pw->m; + SHIFT_TYPE *table = pw->table; + const Py_ssize_t gap = pw->gap; + const Py_ssize_t cut = pw->cut; + int is_periodic = pw->is_periodic; + Py_ssize_t period = is_periodic ? pw->period : Py_MAX(gap, pw->period); + Py_ssize_t gap_jump_end = Py_MIN(m, cut + gap); + + // Direction Independent + const Py_ssize_t w = n - m; + const Py_ssize_t m_m1 = m - 1; + + // Direction Dependent + const Py_ssize_t p_stt = dir == 1 ? 0 : m - 1; + const Py_ssize_t s_stt = dir == 1 ? 0 : n - 1; + const Py_ssize_t p_end = dir == 1 ? m - 1 : 0; + const Py_ssize_t cut_idx = reversed ? m - cut : cut; + const Py_ssize_t dir_m_m1 = reversed ? -m_m1 : m_m1; + const STRINGLIB_CHAR *const ss = s + s_stt + dir_m_m1; + + // Indexers + Py_ssize_t i, j, ip, jp; + // Temporary + Py_ssize_t j_off; // offset from last to leftmost window index + Py_ssize_t shift; + int do_mem_jump = 0; + // Counters Py_ssize_t iloop = 0; Py_ssize_t ihits = 0; - Py_ssize_t gap_jump_end = Py_MIN(len_needle, cut + gap); - int is_periodic = pw->is_periodic; + Py_ssize_t count = 0; Py_ssize_t memory = 0; - if (is_periodic) { - LOG("Needle is periodic.\n"); - } - else { - LOG("Needle is not periodic.\n"); - period = Py_MAX(gap, period); - } - while (window_last < haystack_end) { - assert(memory == 0); - LOG_LINEUP(); + // Loop + for (i = 0; i <= w;) { iloop++; - shift = table[(*window_last) & TABLE_MASK]; - window_last += shift; + ip = reversed ? -i : i; + LOG("Last window ch: %c\n", ss[ip]); + LOG_LINEUP(); + shift = table[ss[ip] & TABLE_MASK]; if (shift != 0){ - LOG("Horspool skip\n"); + if (do_mem_jump) { + // A mismatch has been identified to the right + // of where i will next start, so we can jump + // at least as far as if the mismatch occurred + // on the first comparison. 
+ shift = Py_MAX(shift, Py_MAX(1, memory - cut + 1)); + memory = 0; + LOG("Skip with Memory: %ld\n", shift); + } + assert(memory == 0); + LOG("Shift: %ld\n", shift); + i += shift; + do_mem_jump = 0; continue; } - no_shift: - if (window_last >= haystack_end){ - break; - } - window = window_last - len_needle + 1; - assert((window[len_needle - 1] & TABLE_MASK) == - (needle[len_needle - 1] & TABLE_MASK)); - if (is_periodic) { - i = Py_MAX(cut, memory); - } else { - i = cut; - } - for (; i < len_needle; i++) { + assert((ss[ip] & TABLE_MASK) == (p[m - 1] & TABLE_MASK)); + j_off = ip - p_end; + j = is_periodic ? Py_MAX(cut, memory) : cut; + for (; j < m; j++) { ihits++; - if (needle[i] != window[i]) { - if (i < gap_jump_end) { + jp = p_stt + (reversed ? -j : j); + LOG("Checking: %c vs %c\n", ss[j_off + jp], p[jp]); + if (ss[j_off + j] != p[j]) { + if (j < gap_jump_end) { LOG("Early right half mismatch: jump by gap.\n"); - assert(gap >= i - cut + 1); - window_last += gap; + assert(gap >= j - cut + 1); + i += gap; } else { LOG("Late right half mismatch: jump by n (>gap)\n"); - assert(i - cut + 1 > gap); - window_last += i - cut + 1; + assert(j - cut + 1 > gap); + i += j - cut + 1; } memory = 0; break; } } - if (i != len_needle){ + if (j != m) { continue; } - if (is_periodic) { - for (i = memory; i < cut; i++) { - ihits++; - if (needle[i] != window[i]) { - LOG("Left half does not match.\n"); - window_last += period; - memory = len_needle - period; - if (window_last >= haystack_end) { - break; - } - iloop++; - Py_ssize_t shift = table[(*window_last) & TABLE_MASK]; - if (shift != 0){ - // A mismatch has been identified to the right - // of where i will next start, so we can jump - // at least as far as if the mismatch occurred - // on the first comparison. 
- Py_ssize_t mem_jump = Py_MAX(cut, memory) - cut + 1; - LOG("Skip with Memory.\n"); - memory = 0; - window_last += Py_MAX(shift, mem_jump); - break; - } - goto no_shift; - } - } - } else { - for (i = 0; i < cut; i++) { - ihits++; - if (needle[i] != window[i]) { - LOG("Left half does not match.\n"); - window_last += period; - break; - } - } - } - if (i == cut) { - LOG("Found a match!\n"); - if (mode != FAST_COUNT) { - return window - haystack; - } - count++; - if (count == maxcount) { - return maxcount; - } - window_last += len_needle; - } - } - // Loop Counter and Memory Access Counter Logging (Used for calibration) - // In worst case scenario iloop == n - m - // iloop == ihits indicates linear performance for quadratic problems - LOG("iloop: %ld\n", iloop); - LOG("ihits: %ld\n", ihits); - if (mode == FAST_COUNT) { - LOG("Counting finished.\n"); - return count; - } - else { - LOG("Not found. Returning -1.\n"); - return -1; - } -} - - -static Py_ssize_t -STRINGLIB(_two_way_rev)(const STRINGLIB_CHAR *haystack, - Py_ssize_t len_haystack, - Py_ssize_t maxcount, - int mode, - STRINGLIB(prework) *pw) -{ - // Reversed Crochemore and Perrin's (1991) Two-Way algorithm. - // See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260 - const Py_ssize_t len_needle = pw->len_needle; - const Py_ssize_t cut = pw->cut - 1; - // TODO> - const Py_ssize_t left_len = len_needle - pw->cut; - // TODO> - Py_ssize_t period = pw->period; - const STRINGLIB_CHAR *const needle = pw->needle; - const STRINGLIB_CHAR *const haystack_end = haystack + len_haystack; - const STRINGLIB_CHAR *window = haystack_end - len_needle; - SHIFT_TYPE *table = pw->table; - LOG("===== Two-way: \"%s\" in \"%s\". 
=====\n", needle, haystack); - if (mode == FAST_COUNT){ - LOG("###### Counting \"%s\" in \"%s\".\n", needle, haystack); - } - else { - LOG("###### Finding \"%s\" in \"%s\".\n", needle, haystack); - } - Py_ssize_t inv_cut; - Py_ssize_t count = 0; - Py_ssize_t gap = pw->gap; - Py_ssize_t shift, i; - Py_ssize_t iloop = 0; - Py_ssize_t ihits = 0; - Py_ssize_t gap_jump_end = Py_MAX(0, cut - gap); - int is_periodic = pw->is_periodic; - Py_ssize_t memory = 0; - if (is_periodic) { - LOG("Needle is periodic.\n"); - } - else { - LOG("Needle is not periodic.\n"); - period = Py_MAX(gap, period); - } - while (window >= haystack) { - assert(memory == 0); - LOG_LINEUP_REV(); - iloop++; - shift = table[(*window) & TABLE_MASK]; - window -= shift; - if (shift != 0){ - LOG("Horspool skip\n"); - continue; - } - no_shift: - if (window < haystack){ - break; - } - assert((window[0] & TABLE_MASK) == (needle[0] & TABLE_MASK)); - if (is_periodic) { - i = Py_MIN(cut, len_needle - memory); - } else { - i = cut; - } - for (; i >= 0; i--) { + for (j = memory; j < cut; j++) { ihits++; - if (needle[i] != window[i]) { - if (i > gap_jump_end) { - LOG("Early left half mismatch: jump by gap.\n"); - assert(gap >= cut - i + 1); - window -= gap; - } - else { - LOG("Late left half mismatch: jump by n (>gap)\n"); - assert(cut - i + 1 > gap); - window -= cut - i + 1; + jp = p_stt + (reversed ? 
-j : j); + LOG("Checking: %c vs %c\n", ss[j_off + jp], p[jp]); + if (ss[j_off + j] != p[j]) { + LOG("Left half does not match.\n"); + if (is_periodic) { + memory = m - period; + do_mem_jump = 1; } - memory = 0; + i += period; break; } } - if (i != -1){ - continue; - } - - if (is_periodic) { - for (i = len_needle - 1 - memory; i > cut; i--) { - ihits++; - if (needle[i] != window[i]) { - LOG("Right half does not match.\n"); - window -= period; - memory = len_needle - period; - if (window < haystack) { - break; - } - iloop++; - Py_ssize_t shift = table[(*window) & TABLE_MASK]; - if (shift != 0){ - // A mismatch has been identified to the right - // of where i will next start, so we can jump - // at least as far as if the mismatch occurred - // on the first comparison. - // REF> - // 0 m + - // [0 1 2 3] - // m 0 - inv_cut = len_needle - cut - 1; - Py_ssize_t mem_jump = Py_MAX(inv_cut, memory) - inv_cut + 1; - LOG("Skip with Memory.\n"); - memory = 0; - window -= Py_MAX(shift, mem_jump); - break; - } - goto no_shift; - } - } - } - else { - for (i = len_needle - 1; i > cut; i--) { - ihits++; - if (needle[i] != window[i]) { - LOG("Right half does not match.\n"); - window -= period; - break; - } - } - } - if (i == cut) { + if (j == cut) { LOG("Found a match!\n"); if (mode != FAST_COUNT) { - return window - haystack; + return reversed ? n - m - i : i; } - count++; - if (count == maxcount) { + if (++count == maxcount) { return maxcount; } - window -= len_needle; + i += m; } + } // Loop Counter and Memory Access Counter Logging (Used for calibration) // In worst case scenario iloop == n - m // iloop == ihits indicates linear performance for quadratic problems LOG("iloop: %ld\n", iloop); LOG("ihits: %ld\n", ihits); - if (mode == FAST_COUNT) { - LOG("Counting finished.\n"); - return count; - } - else { - LOG("Not found. Returning -1.\n"); - return -1; - } + return mode == FAST_COUNT ? 
count : -1; } static Py_ssize_t -STRINGLIB(two_way_find)(const STRINGLIB_CHAR *haystack, - Py_ssize_t len_haystack, - const STRINGLIB_CHAR *needle, - Py_ssize_t len_needle, - Py_ssize_t maxcount, - int mode, int direction) +STRINGLIB(two_way_find)(const STRINGLIB_CHAR *s, Py_ssize_t n, + const STRINGLIB_CHAR *p, Py_ssize_t m, + Py_ssize_t maxcount, int mode, int direction) { int dir = direction < 0 ? -1 : 1; STRINGLIB(prework) pw; - (&pw)->needle = needle; - (&pw)->len_needle = len_needle; + (&pw)->p = p; + (&pw)->m = m; STRINGLIB(_init_bc_table_gs_gap)(&pw, dir); STRINGLIB(_init_critical_fac)(&pw, dir); - if (dir == 1) { - return STRINGLIB(_two_way)(haystack, len_haystack, maxcount, mode, &pw); - } - else { - return STRINGLIB(_two_way_rev)(haystack, len_haystack, maxcount, mode, &pw); - } - + return STRINGLIB(_two_way)(s, n, maxcount, mode, &pw, dir); } @@ -767,7 +582,7 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, { /* Boyer–Moore–Horspool algorithm with optional dynamic fallback to Two-Way algorithm */ - if (mode == FAST_COUNT){ + if (mode == FAST_COUNT) { LOG("Horspool Counting \"%s\" in \"%s\".\n", p, s); } else { @@ -776,19 +591,20 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, int dir = direction < 0 ? -1 : 1; int reversed = dir < 0; + // Prepare STRINGLIB(prework) pw; - (&pw)->needle = p; - (&pw)->len_needle = m; + (&pw)->p = p; + (&pw)->m = m; STRINGLIB(_init_bc_table_gs_gap)(&pw, dir); Py_ssize_t gap = (&pw)->gap; SHIFT_TYPE *table = (&pw)->table; - // Direction Agnostic + // Direction Independent const Py_ssize_t m_m1 = m - 1; const Py_ssize_t m_p1 = m + 1; const Py_ssize_t w = n - m; - // Direction Sensitive + // Direction Dependent const Py_ssize_t s_stt = dir == 1 ? 0 : n - 1; const Py_ssize_t p_stt = dir == 1 ? 0 : m - 1; const Py_ssize_t p_end = dir == 1 ? 
m - 1 : 0; @@ -796,9 +612,8 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, const STRINGLIB_CHAR *const ss = s + s_stt + dir_m_m1; const STRINGLIB_CHAR p_last = p[p_end]; - // Temporary Variables - Py_ssize_t shift, i, ip, jp, j, j_off; - STRINGLIB_CHAR s_last; + // Indexers + Py_ssize_t i, j, ip, jp; // Use Bloom for len(haystack) >= 10 * len(needle) unsigned long mask = 0; @@ -828,20 +643,25 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, // Two-Way Calibration const double twy_icost = 3.0 * (double)m; // total initialization cost const double twy_lcost = 3.0; // loop cost - // Running variables - double exp_hrs, exp_twy, ll; // expected run times & loops left + // Temporary + double exp_hrs, exp_twy, ll; // expected run times & loops left + Py_ssize_t j_off; // offset from last to leftmost window index + Py_ssize_t shift; + STRINGLIB_CHAR s_last; + // Counters Py_ssize_t count = 0; Py_ssize_t iloop = 0, ihits_last = 0; Py_ssize_t ihits = 0, iloop_last = 0; + // Loop for (i = 0; i <= w;) { iloop++; ip = reversed ? -i : i; s_last = ss[ip]; - LOG("s_last: %c\n", s_last); + LOG("Last window ch: %c\n", s_last); if (true_gap) { shift = 0; if (s_last != p_last) { - if (i < w && !STRINGLIB_BLOOM(mask, ss[ip+dir])){ + if (i < w && !STRINGLIB_BLOOM(mask, ss[ip+dir])) { shift = m_p1; } else { @@ -853,7 +673,7 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, shift = table[s_last & TABLE_MASK]; } if (shift != 0) { - LOG("Shift %ld\n", shift); + LOG("Shift: %ld\n", shift); i += shift; continue; } @@ -863,7 +683,7 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, for (j = 0; j < j_stop; j++) { ihits++; jp = p_stt + (reversed ? 
-j : j); - LOG("Checking %c %c ?\n", ss[j_off + jp], p[jp]); + LOG("Checking: %c vs %c\n", ss[j_off + jp], p[jp]); if (ss[j_off + jp] != p[jp]) { break; } @@ -903,14 +723,9 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, if (exp_twy < exp_hrs) { LOG("switching to two-way algorithm: n=%ld, m=%ld\n", n, m); STRINGLIB(_init_critical_fac)(&pw, dir); - Py_ssize_t res; - if (reversed) { - res = STRINGLIB(_two_way_rev)( - s, n - i, maxcount - count, mode, &pw); - } else { - res = STRINGLIB(_two_way)( - s + i, n - i, maxcount - count, mode, &pw); - } + Py_ssize_t res = STRINGLIB(_two_way)( + reversed ? s : s + i, n - i, + maxcount - count, mode, &pw, dir); if (mode == FAST_SEARCH) { return res == -1 ? -1 : (reversed ? res : res + i); } @@ -930,7 +745,6 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, return mode == FAST_COUNT ? count : -1; } - #undef SHIFT_TYPE #undef NOT_FOUND #undef SHIFT_OVERFLOW @@ -1178,12 +992,12 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, } if (mode != FAST_RSEARCH) { // return STRINGLIB(horspool_find_old)(s, n, p, m, maxcount, mode, 0); - return STRINGLIB(horspool_find)(s, n, p, m, maxcount, mode, 1, 1); - // return STRINGLIB(two_way_find)(s, n, p, m, maxcount, mode); + // return STRINGLIB(horspool_find)(s, n, p, m, maxcount, mode, 1, 1); + return STRINGLIB(two_way_find)(s, n, p, m, maxcount, mode, 1); } else { /* FAST_RSEARCH */ - return STRINGLIB(horspool_find)(s, n, p, m, maxcount, mode, -1, 1); - // return STRINGLIB(two_way_rfind)(s, n, p, m, maxcount, mode); + // return STRINGLIB(horspool_find)(s, n, p, m, maxcount, mode, -1, 1); + return STRINGLIB(two_way_find)(s, n, p, m, maxcount, mode, -1); } } From c8e1cc540dd8f3fbbc279ff1b23643675cf48f9a Mon Sep 17 00:00:00 2001 From: "d.grigonis" Date: Thu, 6 Jun 2024 11:42:02 +0300 Subject: [PATCH 09/28] ready for review --- Objects/stringlib/fastsearch.h | 65 ++++++++++++++++++---------------- 1 file changed, 35 insertions(+), 30 deletions(-) diff --git 
a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index fa0c271f5b3f6c..f17658d8c0e48b 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -185,11 +185,17 @@ STRINGLIB(rfind_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch) # define LOG_LEVEL 0 #if LOG_LEVEL == 1 && STRINGLIB_SIZEOF_CHAR == 1 # define LOG(...) printf(__VA_ARGS__) -# define LOG_STRING(s, n) +# define LOG2(...) +# define LOG_STRING(s, n) if (n < 100) { \ + printf("\"%.*s\"", (int)(n), s); \ +} # define LOG_LINEUP() #elif LOG_LEVEL == 2 && STRINGLIB_SIZEOF_CHAR == 1 # define LOG(...) printf(__VA_ARGS__) -# define LOG_STRING(s, n) printf("\"%.*s\"", (int)(n), s) +# define LOG2(...) printf(__VA_ARGS__) +# define LOG_STRING(s, n) if (n < 100) { \ + printf("\"%.*s\"", (int)(n), s); \ +} # define LOG_LINEUP() do { \ if (n < 100) { \ LOG("> "); LOG_STRING(s, n); \ @@ -199,6 +205,7 @@ STRINGLIB(rfind_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch) } while(0) #else # define LOG(...) +# define LOG2(...) # define LOG_STRING(s, n) # define LOG_LINEUP() #endif @@ -434,12 +441,13 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *s, Py_ssize_t n, // Crochemore and Perrin's (1991) Two-Way algorithm. // See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260 if (mode == FAST_COUNT) { - LOG("Two-way Counting \"%s\" in \"%s\".\n", pw->p, s); + LOG("Two-way Count.\n"); } else { - LOG("Two-way Finding \"%s\" in \"%s\".\n", pw->p, s); + LOG("Two-way Find.\n"); } - + LOG("haystack: "); LOG_STRING(s, n); LOG("\n"); + LOG("needle : "); LOG_STRING(pw->p, pw->m); LOG("\n"); int dir = direction < 0 ? -1 : 1; int reversed = dir < 0; @@ -480,10 +488,10 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *s, Py_ssize_t n, for (i = 0; i <= w;) { iloop++; ip = reversed ? 
-i : i; - LOG("Last window ch: %c\n", ss[ip]); + LOG2("Last window ch: %c\n", ss[ip]); LOG_LINEUP(); shift = table[ss[ip] & TABLE_MASK]; - if (shift != 0){ + if (shift){ if (do_mem_jump) { // A mismatch has been identified to the right // of where i will next start, so we can jump @@ -505,15 +513,15 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *s, Py_ssize_t n, for (; j < m; j++) { ihits++; jp = p_stt + (reversed ? -j : j); - LOG("Checking: %c vs %c\n", ss[j_off + jp], p[jp]); - if (ss[j_off + j] != p[j]) { + LOG2("Checking j=%ld: %c vs %c\n", j, ss[j_off + jp], p[jp]); + if (ss[j_off + jp] != p[jp]) { if (j < gap_jump_end) { - LOG("Early right half mismatch: jump by gap.\n"); + LOG("Early later half mismatch: jump by gap.\n"); assert(gap >= j - cut + 1); i += gap; } else { - LOG("Late right half mismatch: jump by n (>gap)\n"); + LOG("Late later half mismatch: jump by n (>gap)\n"); assert(j - cut + 1 > gap); i += j - cut + 1; } @@ -524,12 +532,13 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *s, Py_ssize_t n, if (j != m) { continue; } - for (j = memory; j < cut; j++) { + j = Py_MIN(memory, cut); + for (; j < cut; j++) { ihits++; jp = p_stt + (reversed ? 
-j : j); - LOG("Checking: %c vs %c\n", ss[j_off + jp], p[jp]); - if (ss[j_off + j] != p[j]) { - LOG("Left half does not match.\n"); + LOG2("Checking j=%ld: %c vs %c\n", j, ss[j_off + jp], p[jp]); + if (ss[j_off + jp] != p[jp]) { + LOG("First half does not match.\n"); if (is_periodic) { memory = m - period; do_mem_jump = 1; @@ -546,6 +555,7 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *s, Py_ssize_t n, if (++count == maxcount) { return maxcount; } + memory = 0; i += m; } @@ -583,11 +593,13 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, /* Boyer–Moore–Horspool algorithm with optional dynamic fallback to Two-Way algorithm */ if (mode == FAST_COUNT) { - LOG("Horspool Counting \"%s\" in \"%s\".\n", p, s); + LOG("Horspool Count.\n"); } else { - LOG("Horspool Finding \"%s\" in \"%s\".\n", p, s); + LOG("Horspool Find\n"); } + LOG("haystack: "); LOG_STRING(s, n); LOG("\n"); + LOG("needle : "); LOG_STRING(p, m); LOG("\n"); int dir = direction < 0 ? -1 : 1; int reversed = dir < 0; @@ -657,7 +669,7 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, iloop++; ip = reversed ? -i : i; s_last = ss[ip]; - LOG("Last window ch: %c\n", s_last); + LOG2("Last window ch: %c\n", s_last); if (true_gap) { shift = 0; if (s_last != p_last) { @@ -672,7 +684,7 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, else { shift = table[s_last & TABLE_MASK]; } - if (shift != 0) { + if (shift) { LOG("Shift: %ld\n", shift); i += shift; continue; @@ -683,7 +695,7 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, for (j = 0; j < j_stop; j++) { ihits++; jp = p_stt + (reversed ? -j : j); - LOG("Checking: %c vs %c\n", ss[j_off + jp], p[jp]); + LOG2("Checking j=%ld: %c vs %c\n", j, ss[j_off + jp], p[jp]); if (ss[j_off + jp] != p[jp]) { break; } @@ -990,14 +1002,7 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, return res == 0 ? 
0 : -1; } } - if (mode != FAST_RSEARCH) { - // return STRINGLIB(horspool_find_old)(s, n, p, m, maxcount, mode, 0); - // return STRINGLIB(horspool_find)(s, n, p, m, maxcount, mode, 1, 1); - return STRINGLIB(two_way_find)(s, n, p, m, maxcount, mode, 1); - } - else { - /* FAST_RSEARCH */ - // return STRINGLIB(horspool_find)(s, n, p, m, maxcount, mode, -1, 1); - return STRINGLIB(two_way_find)(s, n, p, m, maxcount, mode, -1); - } + int dyn = 1; + int dir = mode != FAST_RSEARCH ? 1 : -1; + return STRINGLIB(horspool_find)(s, n, p, m, maxcount, mode, dir, dyn); } From b5bd4c520a12d0878a173553c707970e0ffe54d5 Mon Sep 17 00:00:00 2001 From: "d.grigonis" Date: Thu, 6 Jun 2024 14:36:47 +0300 Subject: [PATCH 10/28] rm variable & add comment --- Objects/stringlib/fastsearch.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index f17658d8c0e48b..9fad6d6e428a20 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -469,7 +469,6 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *s, Py_ssize_t n, const Py_ssize_t p_stt = dir == 1 ? 0 : m - 1; const Py_ssize_t s_stt = dir == 1 ? 0 : n - 1; const Py_ssize_t p_end = dir == 1 ? m - 1 : 0; - const Py_ssize_t cut_idx = reversed ? m - cut : cut; const Py_ssize_t dir_m_m1 = reversed ? -m_m1 : m_m1; const STRINGLIB_CHAR *const ss = s + s_stt + dir_m_m1; @@ -532,7 +531,7 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *s, Py_ssize_t n, if (j != m) { continue; } - j = Py_MIN(memory, cut); + j = Py_MIN(memory, cut); // Needed for j == cut below to be correct for (; j < cut; j++) { ihits++; jp = p_stt + (reversed ? 
-j : j); From 8c87a574666fd370416ca98e7ec766e26d2e9142 Mon Sep 17 00:00:00 2001 From: "d.grigonis" Date: Thu, 6 Jun 2024 19:10:07 +0300 Subject: [PATCH 11/28] styling, comments and PR feedback --- Objects/stringlib/fastsearch.h | 172 +++++++++++++++++++++++++++------ 1 file changed, 140 insertions(+), 32 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 9fad6d6e428a20..753c9b13c98d62 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -220,16 +220,40 @@ STRINGLIB(_lex_search)(const STRINGLIB_CHAR *p, { /* Do a lexicographic search. Essentially this: >>> max(needle[i:] for i in range(len(needle)+1)) - Also find the period of the right half. */ + Also find the period of the right half. + Direction: + dir : {-1, 1} + if dir == -1, then the problem is reverse + In short: + _lex_search(x, -1) == _lex_search(x[::-1], 1) + + Returned cut is "the size of the cut towards chosen direction". + E.g.: + >>> x = '1234' + >>> cut, period = factorize(x, dir=1) # cut = 0 + >>> cut + 0 + >>> cut_idx = cut + >>> x[:cut_idx], x[cut_idx:] + '', '1234' + >>> x = '4321' + >>> cut, period = factorize(x, dir=-1) + >>> cut + 0 + >>> cut_idx = len(x) - cut + >>> x[:cut_idx], x[cut_idx:] + '4321', '' + */ Py_ssize_t max_suffix = 0; Py_ssize_t candidate = 1; Py_ssize_t k = 0; // The period of the right half. Py_ssize_t period = 1; + // stt is starting position from chosen direction Py_ssize_t stt = dir == 1 ? 0 : m - 1; STRINGLIB_CHAR a, b; while (candidate + k < m) { - // each loop increases candidate + k + max_suffix + // each loop increases (in chosen direction) candidate + k + max_suffix a = p[stt + dir*(candidate + k)]; b = p[stt + dir*(max_suffix + k)]; // check if the suffix at candidate is better than max_suffix @@ -306,22 +330,12 @@ STRINGLIB(_factorize)(const STRINGLIB_CHAR *p, The length of this minimal repetition is 7, which is indeed the period of the original string. 
- This is how reverse direction compares to forward: - returned cut is "the size of the cut from the start point". E.g.: - >>> x = '1234' - >>> cut, period = factorize(x, 1) # cut = 0 - >>> cut - 0 - >>> cut_idx = cut - >>> x[:cut_idx], x[cut_idx:] - '', '1234' - >>> x = '4321' - >>> cut, period = factorize(x, -1) - >>> cut - 0 - >>> cut_idx = len(x) - cut - >>> x[:cut_idx], x[cut_idx:] - '4321', '' + Direction: + dir : {-1, 1} + if dir == -1, then the problem is reverse + In short: + _factorize(x, -1) == _factorize(x[::-1], 1) + See docstring of _lex_search if still unclear */ Py_ssize_t cut1, period1, cut2, period2, cut, period; cut1 = STRINGLIB(_lex_search)(p, m, &period1, 0, dir); @@ -348,13 +362,13 @@ STRINGLIB(_factorize)(const STRINGLIB_CHAR *p, #define TABLE_MASK (TABLE_SIZE - 1U) typedef struct STRINGLIB(_pre) { - const STRINGLIB_CHAR *p; - Py_ssize_t m; - Py_ssize_t cut; - Py_ssize_t period; - Py_ssize_t gap; + const STRINGLIB_CHAR *p; // needle + Py_ssize_t m; // length of the needle + Py_ssize_t cut; // Critical Factorization Cut + Py_ssize_t period; // Global Period of the string + Py_ssize_t gap; // "Good Suffix" Last Character Gap int is_periodic; - SHIFT_TYPE table[TABLE_SIZE]; + SHIFT_TYPE table[TABLE_SIZE]; // Boyer-Moore "Bad Character" table } STRINGLIB(prework); @@ -438,8 +452,14 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *s, Py_ssize_t n, Py_ssize_t maxcount, int mode, STRINGLIB(prework) *pw, int direction) { - // Crochemore and Perrin's (1991) Two-Way algorithm. - // See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260 + /* Crochemore and Perrin's (1991) Two-Way algorithm. 
+ See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260 + Bi-Directional Conventions: + See docstring of horspool_find + + Critical factorization reversion: + See docstring of _factorize + */ if (mode == FAST_COUNT) { LOG("Two-way Count.\n"); } @@ -464,7 +484,6 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *s, Py_ssize_t n, // Direction Independent const Py_ssize_t w = n - m; const Py_ssize_t m_m1 = m - 1; - // Direction Dependent const Py_ssize_t p_stt = dir == 1 ? 0 : m - 1; const Py_ssize_t s_stt = dir == 1 ? 0 : n - 1; @@ -531,6 +550,7 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *s, Py_ssize_t n, if (j != m) { continue; } + j = Py_MIN(memory, cut); // Needed for j == cut below to be correct for (; j < cut; j++) { ihits++; @@ -590,7 +610,93 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, int direction, int dynamic) { /* Boyer–Moore–Horspool algorithm - with optional dynamic fallback to Two-Way algorithm */ + with optional dynamic fallback to Two-Way algorithm + Bi-Directional Conventions: + stt - start index + end - end index + ss - pointer to last window index that matches last needle character + >>> dir_fwd, dir_rev = 1, -1 + >>> s = [0, 1, 2, 3, 4, 5] + >>> s_stt_fwd, s_stt_rev = 0, 5 + >>> s_end_fwd, s_end_rev = 5, 0 + >>> p = [0, 1] + >>> m = len(p) + >>> s = 0 + >>> ss_fwd = s + s_stt_fwd + dir_fwd * (m - 1) + >>> ss_rev = s + s_stt_rev + dir_rev * (m - 1) + >>> ss_fwd, ss_rev + (1, 4) + + There is one more important variable here: j_off + It brings ss in alignment with a needle. 
+ So that it stands at the first absolute index of the window + + >>> i = 0 # first step + >>> p_stt_fwd, p_stt_rev = 0, 1 + >>> p_end_fwd, p_end_rev = 1, 0 + >>> j_off_fwd = dir_fwd * i - p_end_fwd + >>> ss_fwd + j_off_fwd + 0 + + such that [0, 1, 2, 3, 4, 5] + [0, 1] + * - both indices are at 0 here + + >>> j_off_rev = dir_rev * i - p_end_rev + >>> ss_rev + j_off_rev + 4 + + such that [0, 1, 2, 3, 4, 5] + [0, 1] + * - both indices are at 0 here + Finally, which side it iterates from is determined by: + jp = p_stt + (reversed ? -j : j); + + With this transformation the problem becomes direction agnostic + + Dynamic mode + 'Horspool' algorithm will switch to `two_way_find` if it predicts + that it can solve the problem faster. + + Calibration + The simple model for run time of search algorithm is as follows: + loop - actual loop that happens (not theoretical) + init_cost - initialization cost per 1 needle character in ns + loop_cost - cost of 1 main loop + hit_cost - cost of 1 false positive character check + avg_hit - average number of false positive hits per 1 loop + + >>> m = len(needle) + >>> run_time = m * m + n_loops * (loop_cost + hit_cost * avg_hit) + + Calibrate: + 1. expose function to run without handling special cases first. + 2. set dynamic = 0 + 3. Enable counter printing to know how many hits and loops happened + iloop & ihits at the end of the function + + 4. init_cost = run_time(horspool_find(s='', p='*' * m)) / m + + 5. `two_way` only has loop cost. + run_time(two_way_find(s='*' * 1000)) - init_cost + loop_cost = ------------------------------------------------ + n_loops (from stdout) + Note, iloop & ihits of `two_way` should be the same. + + 6. 
To get loop_cost and hit_cost of `horspool_find` solve + equation system representing 2 different runs + n_loops1 * loop_cost + n_hits1 * hit_cost = run_time(problem_1) + n_loops2 * loop_cost + n_hits2 * hit_cost = run_time(problem_2) + + init_cost of `horspool` for larger problems is negligible + Furthermore, it is not used from within as it has already happened + + 7. Run above for different problems. if results differ take averages + Compare with current calibration constants + + 8. It works well, but is not perfect. + See if you can come up with more accurate model. + */ if (mode == FAST_COUNT) { LOG("Horspool Count.\n"); } @@ -614,7 +720,6 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, const Py_ssize_t m_m1 = m - 1; const Py_ssize_t m_p1 = m + 1; const Py_ssize_t w = n - m; - // Direction Dependent const Py_ssize_t s_stt = dir == 1 ? 0 : n - 1; const Py_ssize_t p_stt = dir == 1 ? 0 : m - 1; @@ -688,6 +793,7 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, i += shift; continue; } + // assert(s_last == p_last); // true_gap // assert((s_last & TABLE_MASK) == (p_last & TABLE_MASK)); // else j_off = ip - p_end; @@ -723,6 +829,7 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, LOG("Move by table gap = %ld\n", gap); i += gap; } + if (dynamic) { if (ihits - ihits_last < 100 && iloop - iloop_last < 100) { continue; @@ -1001,7 +1108,8 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, return res == 0 ? 0 : -1; } } - int dyn = 1; - int dir = mode != FAST_RSEARCH ? 1 : -1; - return STRINGLIB(horspool_find)(s, n, p, m, maxcount, mode, dir, dyn); + int dynamic = 1; + int direction = mode != FAST_RSEARCH ? 
1 : -1; + return STRINGLIB(horspool_find)(s, n, p, m, maxcount, mode, + direction, dynamic); } From 4667880894ca89046690112bf1f47858232ad9ff Mon Sep 17 00:00:00 2001 From: "d.grigonis" Date: Thu, 6 Jun 2024 20:00:05 +0300 Subject: [PATCH 12/28] style mostly --- Objects/stringlib/fastsearch.h | 225 +++++++++++++++++---------------- 1 file changed, 117 insertions(+), 108 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 753c9b13c98d62..cb45a3d69fff2f 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -416,12 +416,12 @@ STRINGLIB(_init_critical_fac)(STRINGLIB(prework) *pw, int dir) cut = STRINGLIB(_factorize)(p, m, &period, dir); assert(cut + period <= m); if (dir == 1) { - is_periodic = 0 == memcmp(p, p + period, cut * STRINGLIB_SIZEOF_CHAR); + is_periodic = memcmp(p, p + period, cut * STRINGLIB_SIZEOF_CHAR) == 0; } else { Py_ssize_t cut_idx = m - cut; - is_periodic = (0 == memcmp(p + cut_idx, p + cut_idx - period, - cut * STRINGLIB_SIZEOF_CHAR)); + is_periodic = (memcmp(p + cut_idx, p + cut_idx - period, + cut * STRINGLIB_SIZEOF_CHAR) == 0); } if (is_periodic) { assert(cut <= m/2); @@ -450,15 +450,22 @@ STRINGLIB(_init_critical_fac)(STRINGLIB(prework) *pw, int dir) static Py_ssize_t STRINGLIB(_two_way)(const STRINGLIB_CHAR *s, Py_ssize_t n, Py_ssize_t maxcount, int mode, - STRINGLIB(prework) *pw, int direction) + STRINGLIB(prework) *pw, int dir) { /* Crochemore and Perrin's (1991) Two-Way algorithm. 
See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260 + + Main Inputs + s - haystack + p - needle + n - len(haystack) + m - len(needle) + Bi-Directional Conventions: - See docstring of horspool_find + See docstring of horspool_find Critical factorization reversion: - See docstring of _factorize + See docstring of _factorize */ if (mode == FAST_COUNT) { LOG("Two-way Count.\n"); @@ -468,7 +475,7 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *s, Py_ssize_t n, } LOG("haystack: "); LOG_STRING(s, n); LOG("\n"); LOG("needle : "); LOG_STRING(pw->p, pw->m); LOG("\n"); - int dir = direction < 0 ? -1 : 1; + dir = dir < 0 ? -1 : 1; // This could help compiler a bit int reversed = dir < 0; // Prepare @@ -491,7 +498,7 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *s, Py_ssize_t n, const Py_ssize_t dir_m_m1 = reversed ? -m_m1 : m_m1; const STRINGLIB_CHAR *const ss = s + s_stt + dir_m_m1; - // Indexers + // Indexers (ip and jp are directional indices. e.g. ip = pos * i) Py_ssize_t i, j, ip, jp; // Temporary Py_ssize_t j_off; // offset from last to leftmost window index @@ -591,9 +598,8 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *s, Py_ssize_t n, static Py_ssize_t STRINGLIB(two_way_find)(const STRINGLIB_CHAR *s, Py_ssize_t n, const STRINGLIB_CHAR *p, Py_ssize_t m, - Py_ssize_t maxcount, int mode, int direction) + Py_ssize_t maxcount, int mode, int dir) { - int dir = direction < 0 ? 
-1 : 1; STRINGLIB(prework) pw; (&pw)->p = p; (&pw)->m = m; @@ -607,95 +613,102 @@ static Py_ssize_t STRINGLIB(horspool_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, const STRINGLIB_CHAR* p, Py_ssize_t m, Py_ssize_t maxcount, int mode, - int direction, int dynamic) + int dir, int dynamic) { /* Boyer–Moore–Horspool algorithm with optional dynamic fallback to Two-Way algorithm - Bi-Directional Conventions: - stt - start index - end - end index - ss - pointer to last window index that matches last needle character - >>> dir_fwd, dir_rev = 1, -1 - >>> s = [0, 1, 2, 3, 4, 5] - >>> s_stt_fwd, s_stt_rev = 0, 5 - >>> s_end_fwd, s_end_rev = 5, 0 - >>> p = [0, 1] - >>> m = len(p) - >>> s = 0 - >>> ss_fwd = s + s_stt_fwd + dir_fwd * (m - 1) - >>> ss_rev = s + s_stt_rev + dir_rev * (m - 1) - >>> ss_fwd, ss_rev - (1, 4) - - There is one more important variable here: j_off - It brings ss in alignment with a needle. - So that it stands at the first absolute index of the window - - >>> i = 0 # first step - >>> p_stt_fwd, p_stt_rev = 0, 1 - >>> p_end_fwd, p_end_rev = 1, 0 - >>> j_off_fwd = dir_fwd * i - p_end_fwd - >>> ss_fwd + j_off_fwd - 0 - - such that [0, 1, 2, 3, 4, 5] - [0, 1] - * - both indices are at 0 here - - >>> j_off_rev = dir_rev * i - p_end_rev - >>> ss_rev + j_off_rev - 4 - - such that [0, 1, 2, 3, 4, 5] - [0, 1] - * - both indices are at 0 here - Finally, which side it iterates from is determined by: - jp = p_stt + (reversed ? -j : j); - - With this transformation the problem becomes direction agnostic - - Dynamic mode - 'Horspool' algorithm will switch to `two_way_find` if it predicts - that it can solve the problem faster. 
- - Calibration - The simple model for run time of search algorithm is as follows: - loop - actual loop that happens (not theoretical) - init_cost - initialization cost per 1 needle character in ns - loop_cost - cost of 1 main loop - hit_cost - cost of 1 false positive character check - avg_hit - average number of false positive hits per 1 loop - - >>> m = len(needle) - >>> run_time = m * m + n_loops * (loop_cost + hit_cost * avg_hit) - - Calibrate: - 1. expose function to run without handling special cases first. - 2. set dynamic = 0 - 3. Enable counter printing to know how many hits and loops happened - iloop & ihits at the end of the function - - 4. init_cost = run_time(horspool_find(s='', p='*' * m)) / m - - 5. `two_way` only has loop cost. - run_time(two_way_find(s='*' * 1000)) - init_cost - loop_cost = ------------------------------------------------ - n_loops (from stdout) - Note, iloop & ihits of `two_way` should be the same. - - 6. To get loop_cost and hit_cost of `horspool_find` solve - equation system representing 2 different runs - n_loops1 * loop_cost + n_hits1 * hit_cost = run_time(problem_1) - n_loops2 * loop_cost + n_hits2 * hit_cost = run_time(problem_2) - - init_cost of `horspool` for larger problems is negligible - Furthermore, it is not used from within as it has already happened - - 7. Run above for different problems. if results differ take averages - Compare with current calibration constants - - 8. It works well, but is not perfect. - See if you can come up with more accurate model. 
+ + Main Inputs + s - haystack + p - needle + n - len(haystack) + m - len(needle) + + Bi-Directional Conventions: + stt - start index + end - end index + ss - pointer to last window index that matches last needle character + >>> dir_fwd, dir_rev = 1, -1 + >>> s = [0, 1, 2, 3, 4, 5] + >>> s_stt_fwd, s_stt_rev = 0, 5 + >>> s_end_fwd, s_end_rev = 5, 0 + >>> p = [0, 1] + >>> m = len(p) + >>> s = 0 + >>> ss_fwd = s + s_stt_fwd + dir_fwd * (m - 1) + >>> ss_rev = s + s_stt_rev + dir_rev * (m - 1) + >>> ss_fwd, ss_rev + (1, 4) + + There is one more important variable here: j_off + It brings ss in alignment with a needle. + So that it stands at the first absolute index of the window + + >>> i = 0 # first step + >>> p_stt_fwd, p_stt_rev = 0, 1 + >>> p_end_fwd, p_end_rev = 1, 0 + >>> j_off_fwd = dir_fwd * i - p_end_fwd + >>> ss_fwd + j_off_fwd + 0 + + such that [0, 1, 2, 3, 4, 5] + [0, 1] + * - both indices are at 0 here + + >>> j_off_rev = dir_rev * i - p_end_rev + >>> ss_rev + j_off_rev + 4 + + such that [0, 1, 2, 3, 4, 5] + [0, 1] + * - both indices are at 0 here + Finally, which side it iterates from is determined by: + jp = p_stt + (reversed ? -j : j); + + With this transformation the problem becomes direction agnostic + + Dynamic mode + 'Horspool' algorithm will switch to `two_way_find` if it predicts + that it can solve the problem faster. + + Calibration + The simple model for run time of search algorithm is as follows: + loop - actual loop that happens (not theoretical) + init_cost - initialization cost per 1 needle character in ns + loop_cost - cost of 1 main loop + hit_cost - cost of 1 false positive character check + avg_hit - average number of false positive hits per 1 loop + + >>> m = len(needle) + >>> run_time = init_cost * m + n_loops * (loop_cost + hit_cost * avg_hit) + + Calibrate: + 1. expose function to run without handling special cases first. + 2. set dynamic = 0 + 3. Enable counter printing to know how many hits and loops + happened.
see printf(iloop|ihits) at the end of the function + + 4. init_cost = run_time(horspool_find(s='', p='*' * m)) / m + + 5. `two_way` only has loop cost. + run_time(two_way_find(s='*' * 1000)) - init_cost + loop_cost = ------------------------------------------------ + n_loops (from stdout) + Note, iloop & ihits of `two_way` should be the same. + + 6. To get loop_cost and hit_cost of `horspool_find` solve + equation system representing 2 different runs + n_loops1 * loop_cost + n_hits1 * hit_cost = run_time(problem_1) + n_loops2 * loop_cost + n_hits2 * hit_cost = run_time(problem_2) + + init_cost of `horspool` for larger problems is negligible + Furthermore, it is not used as it has already happened + + 7. Run above for different problems. Take averages. + Compare with current calibration constants. + + 8. It works well, but is not perfect. + See if you can come up with more accurate model. */ if (mode == FAST_COUNT) { LOG("Horspool Count.\n"); @@ -705,7 +718,7 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, } LOG("haystack: "); LOG_STRING(s, n); LOG("\n"); LOG("needle : "); LOG_STRING(p, m); LOG("\n"); - int dir = direction < 0 ? -1 : 1; + dir = dir < 0 ? -1 : 1; // This could help compiler a bit int reversed = dir < 0; // Prepare @@ -728,10 +741,10 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, const STRINGLIB_CHAR *const ss = s + s_stt + dir_m_m1; const STRINGLIB_CHAR p_last = p[p_end]; - // Indexers + // Indexers (ip and jp are directional indices. e.g. 
ip = pos * i) Py_ssize_t i, j, ip, jp; - // Use Bloom for len(haystack) >= 10 * len(needle) + // Use Bloom for len(haystack) >= 30 * len(needle) unsigned long mask = 0; Py_ssize_t true_gap = 0; Py_ssize_t j_stop = m; @@ -777,12 +790,9 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, if (true_gap) { shift = 0; if (s_last != p_last) { - if (i < w && !STRINGLIB_BLOOM(mask, ss[ip+dir])) { - shift = m_p1; - } - else { - shift = Py_MAX(table[s_last & TABLE_MASK], 1); - } + shift = (i < w && !STRINGLIB_BLOOM(mask, ss[ip+dir])) + ? m_p1 + : Py_MAX(table[s_last & TABLE_MASK], 1); } } else { @@ -1109,7 +1119,6 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, } } int dynamic = 1; - int direction = mode != FAST_RSEARCH ? 1 : -1; - return STRINGLIB(horspool_find)(s, n, p, m, maxcount, mode, - direction, dynamic); + int dir = mode != FAST_RSEARCH ? 1 : -1; + return STRINGLIB(horspool_find)(s, n, p, m, maxcount, mode, dir, dynamic); } From f3671e1122b7056e4eb044889473a86196d57a2e Mon Sep 17 00:00:00 2001 From: "d.grigonis" Date: Sat, 8 Jun 2024 05:55:41 +0300 Subject: [PATCH 13/28] comments and calibration --- Objects/stringlib/fastsearch.h | 48 +++++++++++++++++----------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index cb45a3d69fff2f..7609432bbcada6 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -223,26 +223,25 @@ STRINGLIB(_lex_search)(const STRINGLIB_CHAR *p, Also find the period of the right half. Direction: dir : {-1, 1} - if dir == -1, then the problem is reverse + if dir == -1, then the problem is reverse In short: - _lex_search(x, -1) == _lex_search(x[::-1], 1) - - Returned cut is "the size of the cut towards chosen direction". 
- E.g.: - >>> x = '1234' - >>> cut, period = factorize(x, dir=1) # cut = 0 - >>> cut - 0 - >>> cut_idx = cut - >>> x[:cut_idx], x[cut_idx:] - '', '1234' - >>> x = '4321' - >>> cut, period = factorize(x, dir=-1) - >>> cut - 0 - >>> cut_idx = len(x) - cut - >>> x[:cut_idx], x[cut_idx:] - '4321', '' + _lex_search(x, -1) == _lex_search(x[::-1], 1) + + Returned cut is the size of the cut towards chosen direction. E.g.: + >>> x = '1234' + >>> cut, period = factorize(x, dir=1) # cut = 0 + >>> cut + 0 + >>> cut_idx = cut + >>> x[:cut_idx], x[cut_idx:] + '', '1234' + >>> x = '4321' + >>> cut, period = factorize(x, dir=-1) + >>> cut + 0 + >>> cut_idx = len(x) - cut + >>> x[:cut_idx], x[cut_idx:] + '4321', '' */ Py_ssize_t max_suffix = 0; Py_ssize_t candidate = 1; @@ -640,7 +639,7 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, >>> ss_fwd, ss_rev (1, 4) - There is one more important variable here: j_off + There is one more important variable: j_off It brings ss in alignment with a needle. So that it stands at the first absolute index of the window @@ -653,7 +652,7 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, such that [0, 1, 2, 3, 4, 5] [0, 1] - * - both indices are at 0 here + * - both indices are at 0 >>> j_off_rev = dir_rev * i - p_end_rev >>> ss_rev + j_off_rev @@ -661,9 +660,10 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, such that [0, 1, 2, 3, 4, 5] [0, 1] - * - both indices are at 0 here + * - both indices are at 0 Finally, which side it iterates from is determined by: jp = p_stt + (reversed ? 
-j : j); + , where j is an increasing needle-size counter in both cases With this transformation the problem becomes direction agnostic @@ -770,7 +770,7 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, const double hrs_lcost = 4.0; // average loop cost const double hrs_hcost = 0.4; // false positive hit cost // Two-Way Calibration - const double twy_icost = 3.0 * (double)m; // total initialization cost + const double twy_icost = 3.5 * (double)m; // total initialization cost const double twy_lcost = 3.0; // loop cost // Temporary double exp_hrs, exp_twy, ll; // expected run times & loops left @@ -1118,7 +1118,7 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, return res == 0 ? 0 : -1; } } - int dynamic = 1; + int dynamic = 1; // dynamic fallback to two-way algorithm flag int dir = mode != FAST_RSEARCH ? 1 : -1; return STRINGLIB(horspool_find)(s, n, p, m, maxcount, mode, dir, dynamic); } From 704cf89c9b02097773a9ac1c8c393328b3ebefb4 Mon Sep 17 00:00:00 2001 From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com> Date: Sat, 8 Jun 2024 23:23:46 +0000 Subject: [PATCH 14/28] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20b?= =?UTF-8?q?lurb=5Fit.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../2024-06-08-23-23-45.gh-issue-119702.PYdAU2.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2024-06-08-23-23-45.gh-issue-119702.PYdAU2.rst diff --git a/Misc/NEWS.d/next/Core and Builtins/2024-06-08-23-23-45.gh-issue-119702.PYdAU2.rst b/Misc/NEWS.d/next/Core and Builtins/2024-06-08-23-23-45.gh-issue-119702.PYdAU2.rst new file mode 100644 index 00000000000000..2f84a9c51329cd --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2024-06-08-23-23-45.gh-issue-119702.PYdAU2.rst @@ -0,0 +1 @@ +Another round of optimization of string search algorithms has been done. In addition, they were adapted to reverse search (`rfind`). 
From dad1c9e32ceea7ea508bb6fa0a02fa4559b80aca Mon Sep 17 00:00:00 2001 From: "d.grigonis" Date: Sun, 9 Jun 2024 05:07:06 +0300 Subject: [PATCH 15/28] refactoring variable name changes and cleanup --- Objects/stringlib/fastsearch.h | 555 ++++++++++++--------------------- 1 file changed, 192 insertions(+), 363 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 7609432bbcada6..24536e25618e2f 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -212,49 +212,51 @@ STRINGLIB(rfind_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch) Py_LOCAL_INLINE(Py_ssize_t) -STRINGLIB(_lex_search)(const STRINGLIB_CHAR *p, - Py_ssize_t m, +STRINGLIB(_lex_search)(const STRINGLIB_CHAR *needle, + Py_ssize_t needle_len, Py_ssize_t *return_period, int invert_alphabet, - int dir) + int reversed) { /* Do a lexicographic search. Essentially this: >>> max(needle[i:] for i in range(len(needle)+1)) Also find the period of the right half. Direction: - dir : {-1, 1} - if dir == -1, then the problem is reverse + reversed : {0, 1} + if reversed == 1, then the problem is reverse In short: - _lex_search(x, -1) == _lex_search(x[::-1], 1) + _lex_search(x, 1) == _lex_search(x[::-1], 0) - Returned cut is the size of the cut towards chosen direction. E.g.: + Returned cut is the size of the cut towards the chosen direction. E.g.: >>> x = '1234' - >>> cut, period = factorize(x, dir=1) # cut = 0 + >>> cut, period = factorize(x, reversed=0) # cut = 0 >>> cut 0 >>> cut_idx = cut >>> x[:cut_idx], x[cut_idx:] '', '1234' >>> x = '4321' - >>> cut, period = factorize(x, dir=-1) + >>> cut, period = factorize(x, reversed=1) >>> cut 0 >>> cut_idx = len(x) - cut >>> x[:cut_idx], x[cut_idx:] '4321', '' */ + int dir = reversed ? -1 : 1; + // starting position from chosen direction + Py_ssize_t stt = reversed ?
m - 1 : 0; + Py_ssize_t m = needle_len; Py_ssize_t max_suffix = 0; Py_ssize_t candidate = 1; Py_ssize_t k = 0; // The period of the right half. Py_ssize_t period = 1; - // stt is starting position from chosen direction - Py_ssize_t stt = dir == 1 ? 0 : m - 1; STRINGLIB_CHAR a, b; while (candidate + k < m) { // each loop increases (in chosen direction) candidate + k + max_suffix - a = p[stt + dir*(candidate + k)]; - b = p[stt + dir*(max_suffix + k)]; + a = needle[stt + dir*(candidate + k)]; + b = needle[stt + dir*(max_suffix + k)]; // check if the suffix at candidate is better than max_suffix if (invert_alphabet ? (b < a) : (a < b)) { // Fell short of max_suffix. @@ -292,10 +294,10 @@ STRINGLIB(_lex_search)(const STRINGLIB_CHAR *p, Py_LOCAL_INLINE(Py_ssize_t) -STRINGLIB(_factorize)(const STRINGLIB_CHAR *p, - Py_ssize_t m, +STRINGLIB(_factorize)(const STRINGLIB_CHAR *needle, + Py_ssize_t needle_len, Py_ssize_t *return_period, - int dir) + int reversed) { /* Do a "critical factorization", making it so that: >>> needle = (left := needle[:cut]) + (right := needle[cut:]) @@ -330,15 +332,15 @@ STRINGLIB(_factorize)(const STRINGLIB_CHAR *p, period of the original string. Direction: - dir : {-1, 1} - if dir == -1, then the problem is reverse + reversed : {0, 1} + if reversed == 1, then the problem is reverse In short: - _factorize(x, -1) == _factorize(x[::-1], 1) + _factorize(x, 1) == _factorize(x[::-1], 0) See docstring of _lex_search if still unclear */ Py_ssize_t cut1, period1, cut2, period2, cut, period; - cut1 = STRINGLIB(_lex_search)(p, m, &period1, 0, dir); - cut2 = STRINGLIB(_lex_search)(p, m, &period2, 1, dir); + cut1 = STRINGLIB(_lex_search)(needle, needle_len, &period1, 0, reversed); + cut2 = STRINGLIB(_lex_search)(needle, needle_len, &period2, 1, reversed); // Take the later cut. 
if (cut1 > cut2) { period = period1; @@ -360,101 +362,133 @@ STRINGLIB(_factorize)(const STRINGLIB_CHAR *p, #define TABLE_SIZE (1U << TABLE_SIZE_BITS) #define TABLE_MASK (TABLE_SIZE - 1U) + typedef struct STRINGLIB(_pre) { - const STRINGLIB_CHAR *p; // needle - Py_ssize_t m; // length of the needle - Py_ssize_t cut; // Critical Factorization Cut + // Data & Direction + const STRINGLIB_CHAR *needle; // needle + Py_ssize_t needle_len; // length of the needle + int reversed; // Reverse Direction + // Containment mask and Full "Good Suffix" gap for last character + unsigned long mask; // Containment bloom type mask + Py_ssize_t true_gap; // Actual Last character gap + // "Bad Character" table and Filtered "Good Suffix" gap for last character + SHIFT_TYPE table[TABLE_SIZE]; // Boyer-Moore "Bad Character" table + Py_ssize_t gap; // Filtered Last Character Gap <= true_gap + // Critical Factorization + Py_ssize_t cut; // Critical Factorization Cut Length Py_ssize_t period; // Global Period of the string - Py_ssize_t gap; // "Good Suffix" Last Character Gap int is_periodic; - SHIFT_TYPE table[TABLE_SIZE]; // Boyer-Moore "Bad Character" table } STRINGLIB(prework); static void -STRINGLIB(_init_bc_table_gs_gap)(STRINGLIB(prework) *pw, int dir) +STRINGLIB(prepare_search)(STRINGLIB(prework) *pw, + int bloom_mask_and_true_gap, + int bc_table_and_gap, + int critical_factorization) { - // 1. Fill up a compressed Boyer-Moore "Bad Character" table - const STRINGLIB_CHAR *p = pw->p; - Py_ssize_t m = pw->m; - Py_ssize_t stt = dir == 1 ? 0 : m - 1; - Py_ssize_t end = dir == 1 ? m - 1 : 0; - Py_ssize_t not_found_shift = Py_MIN(m, MAX_SHIFT); - for (Py_ssize_t i = 0; i < (Py_ssize_t)TABLE_SIZE; i++) { - pw->table[i] = Py_SAFE_DOWNCAST(not_found_shift, - Py_ssize_t, SHIFT_TYPE); + const STRINGLIB_CHAR *p = pw->needle; + Py_ssize_t m = pw->needle_len; + int reversed = pw->reversed; + int dir = reversed ? -1 : 1; + Py_ssize_t stt = reversed ? m - 1 : 0; + Py_ssize_t end = reversed ?
0 : m - 1; + // 1. Containment mask and Full "Good Suffix" gap for last character + if (bloom_mask_and_true_gap) { + const STRINGLIB_CHAR p_last = p[end]; + pw->mask = 0; + pw->true_gap = m; + // Note: true_gap("___aa") = 1 + Py_ssize_t jp; + STRINGLIB_CHAR p_tmp; + STRINGLIB_BLOOM_ADD(pw->mask, p_last); + for (Py_ssize_t j = 1; j < m; j++) { + jp = end + (reversed ? j : -j); + p_tmp = p[jp]; + STRINGLIB_BLOOM_ADD(pw->mask, p_tmp); + if (pw->true_gap == m && p_tmp == p_last) { + pw->true_gap = j; + } + } + LOG("Good Suffix Full Gap: %ld\n", pw->true_gap); } - for (Py_ssize_t i = m - not_found_shift; i < m; i++) { - SHIFT_TYPE shift = Py_SAFE_DOWNCAST(m - 1 - i, + + if (bc_table_and_gap) { + // 2.1. Fill a compressed Boyer-Moore "Bad Character" table + Py_ssize_t not_found_shift = Py_MIN(m, MAX_SHIFT); + for (Py_ssize_t i = 0; i < (Py_ssize_t)TABLE_SIZE; i++) { + pw->table[i] = Py_SAFE_DOWNCAST(not_found_shift, Py_ssize_t, SHIFT_TYPE); - pw->table[p[stt + dir*i] & TABLE_MASK] = shift; - } - // 2. Initialize "Good Suffix" Last Character Gap - // Note: gap("___aa") = 1 - pw->gap = m; - STRINGLIB_CHAR last = p[end] & TABLE_MASK; - for (Py_ssize_t i = 1; i < m; i++) { - STRINGLIB_CHAR x = p[end - dir*i] & TABLE_MASK; - if (x == last) { - pw->gap = i; - break; } + for (Py_ssize_t i = m - not_found_shift; i < m; i++) { + SHIFT_TYPE shift = Py_SAFE_DOWNCAST(m - 1 - i, + Py_ssize_t, SHIFT_TYPE); + pw->table[p[stt + dir*i] & TABLE_MASK] = shift; + } + // 2.2. 
Initialize "Good Suffix" Last Character Gap + // Note: gap("___aa") = 1 + pw->gap = m; + STRINGLIB_CHAR last = p[end] & TABLE_MASK; + for (Py_ssize_t i = 1; i < m; i++) { + STRINGLIB_CHAR x = p[end - dir*i] & TABLE_MASK; + if (x == last) { + pw->gap = i; + break; + } + } + LOG("Good Suffix Partial Gap: %ld\n", pw->gap); } - LOG("Good Suffix Gap: %ld\n", pw->gap); -} - -static void -STRINGLIB(_init_critical_fac)(STRINGLIB(prework) *pw, int dir) -{ - // Calculate Critical Factorization - const STRINGLIB_CHAR *p = pw->p; - Py_ssize_t m = pw->m; - Py_ssize_t cut, period; - int is_periodic; - cut = STRINGLIB(_factorize)(p, m, &period, dir); - assert(cut + period <= m); - if (dir == 1) { - is_periodic = memcmp(p, p + period, cut * STRINGLIB_SIZEOF_CHAR) == 0; - } - else { - Py_ssize_t cut_idx = m - cut; - is_periodic = (memcmp(p + cut_idx, p + cut_idx - period, - cut * STRINGLIB_SIZEOF_CHAR) == 0); - } - if (is_periodic) { - assert(cut <= m/2); - assert(cut < period); - LOG("Needle is periodic.\n"); - } - else { - // A lower bound on the period - // CLARIFY> An upper bound? - period = Py_MAX(cut, m - cut) + 1; - LOG("Needle is not periodic.\n"); + // 3. Calculate Critical Factorization + if (critical_factorization) { + Py_ssize_t period; + Py_ssize_t cut = STRINGLIB(_factorize)(p, m, &period, reversed); + assert(cut + period <= m); + int cmp; + if (reversed) { + Py_ssize_t cut_idx = m - cut; + cmp = memcmp(p + cut_idx, p + cut_idx - period, + cut * STRINGLIB_SIZEOF_CHAR); + } + else { + cmp = memcmp(p, p + period, cut * STRINGLIB_SIZEOF_CHAR); + } + int is_periodic = cmp == 0; + if (is_periodic) { + assert(cut <= m/2); + assert(cut < period); + LOG("Needle is periodic.\n"); + } + else { + // A lower bound on the period + // CLARIFY> An upper bound? 
+ period = Py_MAX(cut, m - cut) + 1; + LOG("Needle is not periodic.\n"); + } + pw->cut = cut; + pw->period = period; + pw->is_periodic = is_periodic; + + LOG("Cut: %ld & Period: %ld\n", cut, period); + LOG("split: "); + LOG_STRING(p, cut); + LOG(" + "); + LOG_STRING(p + cut, m - cut); + LOG("\n"); } - pw->cut = cut; - pw->period = period; - pw->is_periodic = is_periodic; - - LOG("Cut: %ld & Period: %ld\n", cut, period); - LOG("split: "); - LOG_STRING(p, cut); - LOG(" + "); - LOG_STRING(p + cut, m - cut); - LOG("\n"); } static Py_ssize_t -STRINGLIB(_two_way)(const STRINGLIB_CHAR *s, Py_ssize_t n, - Py_ssize_t maxcount, int mode, - STRINGLIB(prework) *pw, int dir) +STRINGLIB(two_way_find)(const STRINGLIB_CHAR *haystack, + const Py_ssize_t haystack_len, + int find_mode, Py_ssize_t maxcount, + STRINGLIB(prework) *pw) { /* Crochemore and Perrin's (1991) Two-Way algorithm. See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260 - Main Inputs + Variable Names s - haystack p - needle n - len(haystack) @@ -466,20 +500,19 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *s, Py_ssize_t n, Critical factorization reversion: See docstring of _factorize */ - if (mode == FAST_COUNT) { - LOG("Two-way Count.\n"); - } - else { - LOG("Two-way Find.\n"); - } + LOG(find_mode ? "Two-way Find.\n" : "Two-way Count.\n"); + + // Collect Data + int reversed = pw->reversed; + int dir = reversed ? -1 : 1; + const STRINGLIB_CHAR *const p = pw->needle; + const STRINGLIB_CHAR *const s = haystack; + const Py_ssize_t m = pw->needle_len; + const Py_ssize_t n = haystack_len; LOG("haystack: "); LOG_STRING(s, n); LOG("\n"); - LOG("needle : "); LOG_STRING(pw->p, pw->m); LOG("\n"); - dir = dir < 0 ? 
-1 : 1; // This could help compiler a bit - int reversed = dir < 0; + LOG("needle : "); LOG_STRING(p, m); LOG("\n"); - // Prepare - const STRINGLIB_CHAR *const p = pw->p; - const Py_ssize_t m = pw->m; + // Retrieve Preparation SHIFT_TYPE *table = pw->table; const Py_ssize_t gap = pw->gap; const Py_ssize_t cut = pw->cut; @@ -574,8 +607,8 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *s, Py_ssize_t n, } if (j == cut) { LOG("Found a match!\n"); - if (mode != FAST_COUNT) { - return reversed ? n - m - i : i; + if (find_mode) { + return reversed ? w - i : i; } if (++count == maxcount) { return maxcount; @@ -590,34 +623,20 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *s, Py_ssize_t n, // iloop == ihits indicates linear performance for quadratic problems LOG("iloop: %ld\n", iloop); LOG("ihits: %ld\n", ihits); - return mode == FAST_COUNT ? count : -1; + return find_mode ? -1 : count; } static Py_ssize_t -STRINGLIB(two_way_find)(const STRINGLIB_CHAR *s, Py_ssize_t n, - const STRINGLIB_CHAR *p, Py_ssize_t m, - Py_ssize_t maxcount, int mode, int dir) -{ - STRINGLIB(prework) pw; - (&pw)->p = p; - (&pw)->m = m; - STRINGLIB(_init_bc_table_gs_gap)(&pw, dir); - STRINGLIB(_init_critical_fac)(&pw, dir); - return STRINGLIB(_two_way)(s, n, maxcount, mode, &pw, dir); -} - - -static Py_ssize_t -STRINGLIB(horspool_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, - const STRINGLIB_CHAR* p, Py_ssize_t m, - Py_ssize_t maxcount, int mode, - int dir, int dynamic) +STRINGLIB(horspool_find)(const STRINGLIB_CHAR* haystack, + const Py_ssize_t haystack_len, + int find_mode, Py_ssize_t maxcount, + int dynamic, STRINGLIB(prework) *pw) { /* Boyer–Moore–Horspool algorithm with optional dynamic fallback to Two-Way algorithm - Main Inputs + Variable Names s - haystack p - needle n - len(haystack) @@ -710,29 +729,30 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, 8. It works well, but is not perfect. See if you can come up with more accurate model. 
*/ - if (mode == FAST_COUNT) { - LOG("Horspool Count.\n"); - } - else { - LOG("Horspool Find\n"); - } + LOG(find_mode ? "Horspool Find\n" : "Horspool Count.\n"); + + // Collect Data + int reversed = pw->reversed; + int dir = reversed ? -1 : 1; + const STRINGLIB_CHAR *const p = pw->needle; + const STRINGLIB_CHAR *const s = haystack; + const Py_ssize_t m = pw->needle_len; + const Py_ssize_t n = haystack_len; LOG("haystack: "); LOG_STRING(s, n); LOG("\n"); LOG("needle : "); LOG_STRING(p, m); LOG("\n"); - dir = dir < 0 ? -1 : 1; // This could help compiler a bit - int reversed = dir < 0; - - // Prepare - STRINGLIB(prework) pw; - (&pw)->p = p; - (&pw)->m = m; - STRINGLIB(_init_bc_table_gs_gap)(&pw, dir); - Py_ssize_t gap = (&pw)->gap; - SHIFT_TYPE *table = (&pw)->table; // Direction Independent const Py_ssize_t m_m1 = m - 1; const Py_ssize_t m_p1 = m + 1; const Py_ssize_t w = n - m; + + // Retrieve Preparation + Py_ssize_t true_gap = pw->true_gap; + unsigned long mask = pw->mask; + Py_ssize_t j_stop = true_gap ? m_m1 : m; + Py_ssize_t gap = pw->gap; + SHIFT_TYPE *table = pw->table; + // Direction Dependent const Py_ssize_t s_stt = dir == 1 ? 0 : n - 1; const Py_ssize_t p_stt = dir == 1 ? 0 : m - 1; @@ -744,28 +764,6 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, // Indexers (ip and jp are directional indices. e.g. ip = pos * i) Py_ssize_t i, j, ip, jp; - // Use Bloom for len(haystack) >= 30 * len(needle) - unsigned long mask = 0; - Py_ssize_t true_gap = 0; - Py_ssize_t j_stop = m; - if (n >= 30 * m) { - j_stop = m_m1; - true_gap = m; - // Note: true_gap("___aa") = 1 - STRINGLIB_CHAR p_tmp; - STRINGLIB_BLOOM_ADD(mask, p_last); - for (j = 1; j < m; j++) { - jp = p_end + (reversed ? 
j : -j); - p_tmp = p[jp]; - STRINGLIB_BLOOM_ADD(mask, p_tmp); - if (true_gap == m && p_tmp == p_last) { - true_gap = j; - } - } - LOG("Good Suffix True Gap: %ld\n", true_gap); - } - - // Total cost of two-way initialization // Horspool Calibration const double hrs_lcost = 4.0; // average loop cost const double hrs_hcost = 0.4; // false positive hit cost @@ -790,9 +788,13 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, if (true_gap) { shift = 0; if (s_last != p_last) { - shift = (i < w && !STRINGLIB_BLOOM(mask, ss[ip+dir])) - ? m_p1 - : Py_MAX(table[s_last & TABLE_MASK], 1); + // full skip: check if next character is part of pattern + if (i < w && !STRINGLIB_BLOOM(mask, ss[ip+dir])) { + shift = m_p1; + } + else { + shift = Py_MAX(table[s_last & TABLE_MASK], 1); + } } } else { @@ -817,8 +819,8 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, } if (j == j_stop) { LOG("Found a match!\n"); - if (mode != FAST_COUNT) { - return reversed ? n - m - i : i; + if (find_mode) { + return reversed ? w - i : i; } if (++count == maxcount) { return maxcount; @@ -826,6 +828,7 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, i += m; } else if (true_gap) { + // full skip: check if next character is part of pattern if (i < w && !STRINGLIB_BLOOM(mask, ss[ip+dir])) { LOG("Move by (m + 1) = %ld\n", m_p1); i += m_p1; @@ -850,11 +853,11 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, exp_twy = twy_icost + ll * twy_lcost; if (exp_twy < exp_hrs) { LOG("switching to two-way algorithm: n=%ld, m=%ld\n", n, m); - STRINGLIB(_init_critical_fac)(&pw, dir); - Py_ssize_t res = STRINGLIB(_two_way)( + STRINGLIB(prepare_search)(pw, 0, 0, 1); + Py_ssize_t res = STRINGLIB(two_way_find)( reversed ? s : s + i, n - i, - maxcount - count, mode, &pw, dir); - if (mode == FAST_SEARCH) { + find_mode, maxcount - count, pw); + if (find_mode) { return res == -1 ? -1 : (reversed ? 
res : res + i); } else { @@ -870,7 +873,7 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, // Used for calibration and fallback decision LOG("iloop: %ld\n", iloop); LOG("ihits: %ld\n", ihits); - return mode == FAST_COUNT ? count : -1; + return find_mode ? -1 : count; } #undef SHIFT_TYPE @@ -886,190 +889,6 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, #undef LOG_LEVEL -static inline Py_ssize_t -STRINGLIB(default_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, - const STRINGLIB_CHAR* p, Py_ssize_t m, - Py_ssize_t maxcount, int mode) -{ - const Py_ssize_t w = n - m; - Py_ssize_t mlast = m - 1, count = 0; - const STRINGLIB_CHAR last = p[mlast]; - const STRINGLIB_CHAR *const ss = &s[mlast]; - - // Initialize Bloom - // Note: gap("___aa") = 0 - Py_ssize_t gap = mlast; - unsigned long mask = 0; - STRINGLIB_BLOOM_ADD(mask, last); - for (Py_ssize_t i = 0; i < mlast; i++) { - STRINGLIB_BLOOM_ADD(mask, p[i]); - if (p[i] == last) { - gap = mlast - i - 1; - } - } - - for (Py_ssize_t i = 0; i <= w; i++) { - if (ss[i] == last) { - /* candidate match */ - Py_ssize_t j; - for (j = 0; j < mlast; j++) { - if (s[i+j] != p[j]) { - break; - } - } - if (j == mlast) { - /* got a match! */ - if (mode != FAST_COUNT) { - return i; - } - count++; - if (count == maxcount) { - return maxcount; - } - i = i + mlast; - continue; - } - /* miss: check if next character is part of pattern */ - if (!STRINGLIB_BLOOM(mask, ss[i+1])) { - i = i + m; - } - else { - i = i + gap; - } - } - else { - /* skip: check if next character is part of pattern */ - if (!STRINGLIB_BLOOM(mask, ss[i+1])) { - i = i + m; - } - } - } - return mode == FAST_COUNT ? 
count : -1; -} - - -static Py_ssize_t -STRINGLIB(adaptive_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, - const STRINGLIB_CHAR* p, Py_ssize_t m, - Py_ssize_t maxcount, int mode) -{ - const Py_ssize_t w = n - m; - Py_ssize_t mlast = m - 1, count = 0; - Py_ssize_t gap = mlast; - Py_ssize_t hits = 0, res; - const STRINGLIB_CHAR last = p[mlast]; - const STRINGLIB_CHAR *const ss = &s[mlast]; - - unsigned long mask = 0; - for (Py_ssize_t i = 0; i < mlast; i++) { - STRINGLIB_BLOOM_ADD(mask, p[i]); - if (p[i] == last) { - gap = mlast - i - 1; - } - } - STRINGLIB_BLOOM_ADD(mask, last); - - for (Py_ssize_t i = 0; i <= w; i++) { - if (ss[i] == last) { - /* candidate match */ - Py_ssize_t j; - for (j = 0; j < mlast; j++) { - if (s[i+j] != p[j]) { - break; - } - } - if (j == mlast) { - /* got a match! */ - if (mode != FAST_COUNT) { - return i; - } - count++; - if (count == maxcount) { - return maxcount; - } - i = i + mlast; - continue; - } - hits += j + 1; - if (hits > m / 4 && w - i > 2000) { - res = STRINGLIB(two_way_find)( - s + i, n - i, p, m, maxcount, mode, 1); - if (mode == FAST_SEARCH) { - return res == -1 ? -1 : res + i; - } - else { - return res + count; - } - } - /* miss: check if next character is part of pattern */ - if (!STRINGLIB_BLOOM(mask, ss[i+1])) { - i = i + m; - } - else { - i = i + gap; - } - } - else { - /* skip: check if next character is part of pattern */ - if (!STRINGLIB_BLOOM(mask, ss[i+1])) { - i = i + m; - } - } - } - return mode == FAST_COUNT ? 
count : -1; -} - - -static Py_ssize_t -STRINGLIB(default_rfind)(const STRINGLIB_CHAR* s, Py_ssize_t n, - const STRINGLIB_CHAR* p, Py_ssize_t m, - Py_ssize_t maxcount, int mode) -{ - /* create compressed boyer-moore delta 1 table */ - unsigned long mask = 0; - Py_ssize_t i, j, mlast = m - 1, skip = m - 1, w = n - m; - - /* process pattern[0] outside the loop */ - STRINGLIB_BLOOM_ADD(mask, p[0]); - /* process pattern[:0:-1] */ - for (i = mlast; i > 0; i--) { - STRINGLIB_BLOOM_ADD(mask, p[i]); - if (p[i] == p[0]) { - skip = i - 1; - } - } - - for (i = w; i >= 0; i--) { - if (s[i] == p[0]) { - /* candidate match */ - for (j = mlast; j > 0; j--) { - if (s[i+j] != p[j]) { - break; - } - } - if (j == 0) { - /* got a match! */ - return i; - } - /* miss: check if previous character is part of pattern */ - if (i > 0 && !STRINGLIB_BLOOM(mask, s[i-1])) { - i = i - m; - } - else { - i = i - skip; - } - } - else { - /* skip: check if previous character is part of pattern */ - if (i > 0 && !STRINGLIB_BLOOM(mask, s[i-1])) { - i = i - m; - } - } - } - return -1; -} - - static inline Py_ssize_t STRINGLIB(count_char)(const STRINGLIB_CHAR *s, Py_ssize_t n, const STRINGLIB_CHAR p0, Py_ssize_t maxcount) @@ -1118,7 +937,17 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, return res == 0 ? 0 : -1; } } - int dynamic = 1; // dynamic fallback to two-way algorithm flag - int dir = mode != FAST_RSEARCH ? 
1 : -1; - return STRINGLIB(horspool_find)(s, n, p, m, maxcount, mode, dir, dynamic); + + STRINGLIB(prework) pw = { + .needle = p, + .needle_len = m, + .reversed = mode == FAST_RSEARCH, + .true_gap = 0, + }; + // Use Bloom for len(haystack) >= 30 * len(needle) + int bloom_mask_and_true_gap = n >= 30 * m; + STRINGLIB(prepare_search)(&pw, bloom_mask_and_true_gap, 1, 0); + int find_mode = mode != FAST_COUNT; + int dynamic = 1; + return STRINGLIB(horspool_find)(s, n, find_mode, maxcount, dynamic, &pw); } From 1a13f3a7950d6f6e683175ac8339121f46e67eba Mon Sep 17 00:00:00 2001 From: "d.grigonis" Date: Sun, 9 Jun 2024 05:12:18 +0300 Subject: [PATCH 16/28] bug --- Objects/stringlib/fastsearch.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 24536e25618e2f..747f4fff7ef883 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -245,15 +245,14 @@ STRINGLIB(_lex_search)(const STRINGLIB_CHAR *needle, */ int dir = reversed ? -1 : 1; // starting position from chosen direction - Py_ssize_t stt = reversed ? m - 1 : 0; - Py_ssize_t m = needle_len; + Py_ssize_t stt = reversed ? needle_len - 1 : 0; Py_ssize_t max_suffix = 0; Py_ssize_t candidate = 1; Py_ssize_t k = 0; // The period of the right half. 
Py_ssize_t period = 1; STRINGLIB_CHAR a, b; - while (candidate + k < m) { + while (candidate + k < needle_len) { // each loop increases (in chosen direction) candidate + k + max_suffix a = needle[stt + dir*(candidate + k)]; b = needle[stt + dir*(max_suffix + k)]; From 5aca5d0ab287ddc67e995be7a42e76b76b82ae52 Mon Sep 17 00:00:00 2001 From: "d.grigonis" Date: Sun, 9 Jun 2024 05:17:15 +0300 Subject: [PATCH 17/28] lint fix and comment --- .../2024-06-08-23-23-45.gh-issue-119702.PYdAU2.rst | 2 +- Objects/stringlib/fastsearch.h | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/Misc/NEWS.d/next/Core and Builtins/2024-06-08-23-23-45.gh-issue-119702.PYdAU2.rst b/Misc/NEWS.d/next/Core and Builtins/2024-06-08-23-23-45.gh-issue-119702.PYdAU2.rst index 2f84a9c51329cd..bb9fbd5ab62088 100644 --- a/Misc/NEWS.d/next/Core and Builtins/2024-06-08-23-23-45.gh-issue-119702.PYdAU2.rst +++ b/Misc/NEWS.d/next/Core and Builtins/2024-06-08-23-23-45.gh-issue-119702.PYdAU2.rst @@ -1 +1 @@ -Another round of optimization of string search algorithms has been done. In addition, they were adapted to reverse search (`rfind`). +Another round of optimization of string search algorithms has been done. In addition, they were adapted to reverse search (``str.rfind``). diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 747f4fff7ef883..ae2f65108a89fe 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -459,8 +459,7 @@ STRINGLIB(prepare_search)(STRINGLIB(prework) *pw, LOG("Needle is periodic.\n"); } else { - // A lower bound on the period - // CLARIFY> An upper bound? 
+ // Lower bound on the period period = Py_MAX(cut, m - cut) + 1; LOG("Needle is not periodic.\n"); } From 6fcf44e4fe5561a9c4420a6829ac35a191e4b3e4 Mon Sep 17 00:00:00 2001 From: "d.grigonis" Date: Mon, 10 Jun 2024 03:28:12 +0300 Subject: [PATCH 18/28] news edit --- .../2024-06-08-23-23-45.gh-issue-119702.PYdAU2.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Core and Builtins/2024-06-08-23-23-45.gh-issue-119702.PYdAU2.rst b/Misc/NEWS.d/next/Core and Builtins/2024-06-08-23-23-45.gh-issue-119702.PYdAU2.rst index bb9fbd5ab62088..7896d8ecbc738b 100644 --- a/Misc/NEWS.d/next/Core and Builtins/2024-06-08-23-23-45.gh-issue-119702.PYdAU2.rst +++ b/Misc/NEWS.d/next/Core and Builtins/2024-06-08-23-23-45.gh-issue-119702.PYdAU2.rst @@ -1 +1 @@ -Another round of optimization of string search algorithms has been done. In addition, they were adapted to reverse search (``str.rfind``). +Optimization of string search algorithms has been done. In addition, efficient solutions were also adapted to reverse search (``str.rfind``). 
From 4e5247274f3c4b7d754282e927ad38b4d7cc7a4a Mon Sep 17 00:00:00 2001 From: "d.grigonis" Date: Fri, 14 Jun 2024 09:17:33 +0300 Subject: [PATCH 19/28] docstring fixes and updates --- Objects/stringlib/fastsearch.h | 41 +++++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index ae2f65108a89fe..45e1ba725c46fa 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -386,6 +386,15 @@ STRINGLIB(prepare_search)(STRINGLIB(prework) *pw, int bc_table_and_gap, int critical_factorization) { + /* + This function prepares different search algorithm methods + + horspool_find: bc_table_and_gap = 1 + bloom_mask_and_true_gap = 1 (optional) + + two_way_find: bc_table_and_gap = 1 + critical_factorization = 1 + */ const STRINGLIB_CHAR *p = pw->needle; Py_ssize_t m = pw->needle_len; int reversed = pw->reversed; @@ -486,17 +495,21 @@ STRINGLIB(two_way_find)(const STRINGLIB_CHAR *haystack, /* Crochemore and Perrin's (1991) Two-Way algorithm. See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260 - Variable Names - s - haystack - p - needle - n - len(haystack) - m - len(needle) + Initialization + pw needs to be initialized using prepare_search with + bc_table_and_gap = 1 and critical_factorization = 1 + + Variable Names + s - haystack + p - needle + n - len(haystack) + m - len(needle) - Bi-Directional Conventions: - See docstring of horspool_find + Bi-Directional Conventions: + See docstring of horspool_find - Critical factorization reversion: - See docstring of _factorize + Critical factorization reversion: + See docstring of _factorize */ LOG(find_mode ? 
"Two-way Find.\n" : "Two-way Count.\n"); @@ -634,6 +647,10 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* haystack, /* Boyer–Moore–Horspool algorithm with optional dynamic fallback to Two-Way algorithm + Initialization + pw needs to be initialized using prepare_search with + bc_table_and_gap = 1 and optionally bloom_mask_and_true_gap = 1 + Variable Names s - haystack p - needle @@ -696,8 +713,10 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* haystack, hit_cost - cost of 1 false positive character check avg_hit - average number of false positive hits per 1 loop - >>> m = len(needle) - >>> run_time = m * m + n_loops * (loop_cost + hit_cost * avg_hit) + m = len(needle) + run_time = init_cost * m + n_loops * (loop_cost + hit_cost * avg_hit) + + Note, n_loops * avg_hit is what causes quadratic complexity. Calibrate: 1. expose function to run without handling special cases first. From ea0fee6485a0e6a6b06f603492653663b3ea1501 Mon Sep 17 00:00:00 2001 From: "d.grigonis" Date: Fri, 14 Jun 2024 09:22:18 +0300 Subject: [PATCH 20/28] clearer blurp --- .../2024-06-08-23-23-45.gh-issue-119702.PYdAU2.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Core and Builtins/2024-06-08-23-23-45.gh-issue-119702.PYdAU2.rst b/Misc/NEWS.d/next/Core and Builtins/2024-06-08-23-23-45.gh-issue-119702.PYdAU2.rst index 7896d8ecbc738b..9e90d255064adf 100644 --- a/Misc/NEWS.d/next/Core and Builtins/2024-06-08-23-23-45.gh-issue-119702.PYdAU2.rst +++ b/Misc/NEWS.d/next/Core and Builtins/2024-06-08-23-23-45.gh-issue-119702.PYdAU2.rst @@ -1 +1 @@ -Optimization of string search algorithms has been done. In addition, efficient solutions were also adapted to reverse search (``str.rfind``). +Optimization of string search algorithms. Performance optimizations extended to reverse search (``str.rfind``). 
From 7c20ca7d306e12300e95b65bbf26240b198afe66 Mon Sep 17 00:00:00 2001 From: "d.grigonis" Date: Thu, 20 Jun 2024 21:44:46 +0300 Subject: [PATCH 21/28] minus one bloom op --- Objects/stringlib/fastsearch.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 45e1ba725c46fa..a58a308bc3b58a 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -27,18 +27,21 @@ #if LONG_BIT >= 128 #define STRINGLIB_BLOOM_WIDTH 128 +#define STRINGLIB_BLOOM_FULL 127 #elif LONG_BIT >= 64 #define STRINGLIB_BLOOM_WIDTH 64 +#define STRINGLIB_BLOOM_FULL 63 #elif LONG_BIT >= 32 #define STRINGLIB_BLOOM_WIDTH 32 +#define STRINGLIB_BLOOM_FULL 31 #else #error "LONG_BIT is smaller than 32" #endif #define STRINGLIB_BLOOM_ADD(mask, ch) \ - ((mask |= (1UL << ((ch) & (STRINGLIB_BLOOM_WIDTH -1))))) + ((mask |= (1UL << ((ch) & (STRINGLIB_BLOOM_FULL))))) #define STRINGLIB_BLOOM(mask, ch) \ - ((mask & (1UL << ((ch) & (STRINGLIB_BLOOM_WIDTH -1))))) + ((mask & (1UL << ((ch) & (STRINGLIB_BLOOM_FULL))))) #ifdef STRINGLIB_FAST_MEMCHR # define MEMCHR_CUT_OFF 15 From c2859c0c733c370fbdebb018b28b8ce212acadd1 Mon Sep 17 00:00:00 2001 From: "d.grigonis" Date: Fri, 21 Jun 2024 07:48:48 +0300 Subject: [PATCH 22/28] table size 64 -> 128 --- Objects/stringlib/fastsearch.h | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index a58a308bc3b58a..71be6ffe2e4960 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -26,22 +26,19 @@ #define FAST_RSEARCH 2 #if LONG_BIT >= 128 -#define STRINGLIB_BLOOM_WIDTH 128 -#define STRINGLIB_BLOOM_FULL 127 +# define STRINGLIB_BLOOM_MASK 127 #elif LONG_BIT >= 64 -#define STRINGLIB_BLOOM_WIDTH 64 -#define STRINGLIB_BLOOM_FULL 63 +# define STRINGLIB_BLOOM_MASK 63 #elif LONG_BIT >= 32 -#define STRINGLIB_BLOOM_WIDTH 32 -#define 
STRINGLIB_BLOOM_FULL 31 +# define STRINGLIB_BLOOM_MASK 31 #else -#error "LONG_BIT is smaller than 32" +# error "LONG_BIT is smaller than 32" #endif #define STRINGLIB_BLOOM_ADD(mask, ch) \ - ((mask |= (1UL << ((ch) & (STRINGLIB_BLOOM_FULL))))) + ((mask |= (1UL << ((ch) & (STRINGLIB_BLOOM_MASK))))) #define STRINGLIB_BLOOM(mask, ch) \ - ((mask & (1UL << ((ch) & (STRINGLIB_BLOOM_FULL))))) + ((mask & (1UL << ((ch) & (STRINGLIB_BLOOM_MASK))))) #ifdef STRINGLIB_FAST_MEMCHR # define MEMCHR_CUT_OFF 15 @@ -360,9 +357,8 @@ STRINGLIB(_factorize)(const STRINGLIB_CHAR *needle, #define SHIFT_TYPE uint8_t #define MAX_SHIFT UINT8_MAX -#define TABLE_SIZE_BITS 6u -#define TABLE_SIZE (1U << TABLE_SIZE_BITS) -#define TABLE_MASK (TABLE_SIZE - 1U) +#define TABLE_SIZE 128U +#define TABLE_MASK 127U typedef struct STRINGLIB(_pre) { @@ -899,7 +895,6 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* haystack, #undef SHIFT_TYPE #undef NOT_FOUND #undef SHIFT_OVERFLOW -#undef TABLE_SIZE_BITS #undef TABLE_SIZE #undef TABLE_MASK From f34198948993e767acb1d52709800aa3cd3bdc85 Mon Sep 17 00:00:00 2001 From: "d.grigonis" Date: Fri, 21 Jun 2024 08:16:23 +0300 Subject: [PATCH 23/28] moved bloom setup closer to its logic --- Objects/stringlib/fastsearch.h | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 71be6ffe2e4960..812f0472be78c8 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -25,21 +25,6 @@ #define FAST_SEARCH 1 #define FAST_RSEARCH 2 -#if LONG_BIT >= 128 -# define STRINGLIB_BLOOM_MASK 127 -#elif LONG_BIT >= 64 -# define STRINGLIB_BLOOM_MASK 63 -#elif LONG_BIT >= 32 -# define STRINGLIB_BLOOM_MASK 31 -#else -# error "LONG_BIT is smaller than 32" -#endif - -#define STRINGLIB_BLOOM_ADD(mask, ch) \ - ((mask |= (1UL << ((ch) & (STRINGLIB_BLOOM_MASK))))) -#define STRINGLIB_BLOOM(mask, ch) \ - ((mask & (1UL << ((ch) & (STRINGLIB_BLOOM_MASK))))) - 
#ifdef STRINGLIB_FAST_MEMCHR # define MEMCHR_CUT_OFF 15 #else @@ -354,6 +339,23 @@ STRINGLIB(_factorize)(const STRINGLIB_CHAR *needle, } +// Bloom Setup +#if LONG_BIT >= 128 +# define STRINGLIB_BLOOM_MASK 127 +#elif LONG_BIT >= 64 +# define STRINGLIB_BLOOM_MASK 63 +#elif LONG_BIT >= 32 +# define STRINGLIB_BLOOM_MASK 31 +#else +# error "LONG_BIT is smaller than 32" +#endif + +#define STRINGLIB_BLOOM_ADD(mask, ch) \ + ((mask |= (1UL << ((ch) & (STRINGLIB_BLOOM_MASK))))) +#define STRINGLIB_BLOOM(mask, ch) \ + ((mask & (1UL << ((ch) & (STRINGLIB_BLOOM_MASK))))) + +// Boyer-Moore "Bad Character" table Setup #define SHIFT_TYPE uint8_t #define MAX_SHIFT UINT8_MAX From 2437983abbfa2f857fbd3fabf9e7aba379ac42b8 Mon Sep 17 00:00:00 2001 From: "d.grigonis" Date: Sat, 22 Jun 2024 16:24:16 +0300 Subject: [PATCH 24/28] micro simplifications and optimizations --- Objects/stringlib/fastsearch.h | 79 ++++++++++++++++------------------ 1 file changed, 36 insertions(+), 43 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 812f0472be78c8..ac7ef3bbc8c4b2 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -744,8 +744,8 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* haystack, 7. Run above for different problems. Take averages. Compare with current calibration constants. - 8. It works well, but is not perfect. - See if you can come up with more accurate model. + 8. Current calibration works well, but is not perfect. + Maybe you can come up with a more accurate model. */ LOG(find_mode ? "Horspool Find\n" : "Horspool Count.\n"); @@ -765,16 +765,17 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* haystack, const Py_ssize_t w = n - m; // Retrieve Preparation - Py_ssize_t true_gap = pw->true_gap; + int bloom_on = pw->true_gap != 0; unsigned long mask = pw->mask; - Py_ssize_t j_stop = true_gap ? 
m_m1 : m; - Py_ssize_t gap = pw->gap; + // If bloom_on last character always matches so no need to check it + Py_ssize_t j_stop = bloom_on ? m_m1 : m; + Py_ssize_t gap = bloom_on ? pw->true_gap : pw->gap; SHIFT_TYPE *table = pw->table; // Direction Dependent - const Py_ssize_t s_stt = dir == 1 ? 0 : n - 1; - const Py_ssize_t p_stt = dir == 1 ? 0 : m - 1; - const Py_ssize_t p_end = dir == 1 ? m - 1 : 0; + const Py_ssize_t s_stt = reversed ? n - 1 : 0; + const Py_ssize_t p_stt = reversed ? m - 1 : 0; + const Py_ssize_t p_end = reversed ? 0 : m - 1; const Py_ssize_t dir_m_m1 = reversed ? -m_m1 : m_m1; const STRINGLIB_CHAR *const ss = s + s_stt + dir_m_m1; const STRINGLIB_CHAR p_last = p[p_end]; @@ -795,46 +796,47 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* haystack, STRINGLIB_CHAR s_last; // Counters Py_ssize_t count = 0; - Py_ssize_t iloop = 0, ihits_last = 0; - Py_ssize_t ihits = 0, iloop_last = 0; + Py_ssize_t iloop = 0, iloop_last = 0; + Py_ssize_t ihits = 0, ihits_last = 0; // Loop for (i = 0; i <= w;) { iloop++; ip = reversed ? -i : i; s_last = ss[ip]; LOG2("Last window ch: %c\n", s_last); - if (true_gap) { + if (bloom_on) { shift = 0; if (s_last != p_last) { - // full skip: check if next character is part of pattern - if (i < w && !STRINGLIB_BLOOM(mask, ss[ip+dir])) { - shift = m_p1; - } - else { - shift = Py_MAX(table[s_last & TABLE_MASK], 1); - } + shift = i < w && !STRINGLIB_BLOOM(mask, ss[ip+dir]) ? 
+ m_p1 : Py_MAX(table[s_last & TABLE_MASK], 1); } + if (shift) { + LOG("Shift: %ld\n", shift); + i += shift; + continue; + } + assert(s_last == p_last); } else { shift = table[s_last & TABLE_MASK]; + if (shift) { + LOG("Shift: %ld\n", shift); + i += shift; + continue; + } + assert((s_last & TABLE_MASK) == (p_last & TABLE_MASK)); } - if (shift) { - LOG("Shift: %ld\n", shift); - i += shift; - continue; - } - - // assert(s_last == p_last); // true_gap - // assert((s_last & TABLE_MASK) == (p_last & TABLE_MASK)); // else j_off = ip - p_end; + jp = p_stt; for (j = 0; j < j_stop; j++) { ihits++; - jp = p_stt + (reversed ? -j : j); LOG2("Checking j=%ld: %c vs %c\n", j, ss[j_off + jp], p[jp]); if (ss[j_off + jp] != p[jp]) { break; } + jp += dir; } + if (j == j_stop) { LOG("Found a match!\n"); if (find_mode) { @@ -845,26 +847,17 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* haystack, } i += m; } - else if (true_gap) { + else if (bloom_on && i < w && !STRINGLIB_BLOOM(mask, ss[ip+dir])) { // full skip: check if next character is part of pattern - if (i < w && !STRINGLIB_BLOOM(mask, ss[ip+dir])) { - LOG("Move by (m + 1) = %ld\n", m_p1); - i += m_p1; - } - else { - LOG("Move by true gap = %ld\n", gap); - i += true_gap; - } + LOG("Move by (m + 1) = %ld\n", m_p1); + i += m_p1; } else { - LOG("Move by table gap = %ld\n", gap); + LOG("Move by gap = %ld\n", gap); i += gap; } - if (dynamic) { - if (ihits - ihits_last < 100 && iloop - iloop_last < 100) { - continue; - } + if (dynamic && ihits - ihits_last >= 100) { ll = (double)(w - i + 1); exp_hrs = ((double)iloop * hrs_lcost + (double)ihits * hrs_hcost) / (double)i * ll; @@ -894,9 +887,9 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* haystack, return find_mode ? 
-1 : count; } +#undef STRINGLIB_BLOOM_MASK + #undef SHIFT_TYPE -#undef NOT_FOUND -#undef SHIFT_OVERFLOW #undef TABLE_SIZE #undef TABLE_MASK From ea0f3628f97f41bf99d12790e5d82366d9da5a4a Mon Sep 17 00:00:00 2001 From: "d.grigonis" Date: Tue, 25 Jun 2024 18:38:47 +0300 Subject: [PATCH 25/28] remove unused variable --- Objects/stringlib/fastsearch.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index ac7ef3bbc8c4b2..8c5090b6c4d1bd 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -796,7 +796,7 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* haystack, STRINGLIB_CHAR s_last; // Counters Py_ssize_t count = 0; - Py_ssize_t iloop = 0, iloop_last = 0; + Py_ssize_t iloop = 0; Py_ssize_t ihits = 0, ihits_last = 0; // Loop for (i = 0; i <= w;) { @@ -876,7 +876,6 @@ STRINGLIB(horspool_find)(const STRINGLIB_CHAR* haystack, } } ihits_last = ihits; - iloop_last = iloop; } } // Loop Counter and False Hit Counter Logging From d6ac158685d5445a1fc27a6574d01ef7bfc1f7a1 Mon Sep 17 00:00:00 2001 From: "d.grigonis" Date: Thu, 27 Jun 2024 15:07:44 +0300 Subject: [PATCH 26/28] microopt --- Objects/stringlib/fastsearch.h | 36 ++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 8c5090b6c4d1bd..87e15f05f417b3 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -402,22 +402,24 @@ STRINGLIB(prepare_search)(STRINGLIB(prework) *pw, int dir = reversed ? -1 : 1; Py_ssize_t stt = reversed ? m - 1 : 0; Py_ssize_t end = reversed ? 0 : m - 1; + Py_ssize_t j, jp; // 1. 
Containment mask and Full "Good Suffix" gap for last character if (bloom_mask_and_true_gap) { const STRINGLIB_CHAR p_last = p[end]; pw->mask = 0; pw->true_gap = m; // Note: true_gap("___aa") = 1 - Py_ssize_t jp; STRINGLIB_CHAR p_tmp; STRINGLIB_BLOOM_ADD(pw->mask, p_last); - for (Py_ssize_t j = 1; j < m; j++) { - jp = end + (reversed ? j : -j); + j = 1; + jp = end - dir * j; + for (; j < m; j++) { p_tmp = p[jp]; STRINGLIB_BLOOM_ADD(pw->mask, p_tmp); if (pw->true_gap == m && p_tmp == p_last) { pw->true_gap = j; } + jp -= dir; } LOG("Good Suffix Full Gap: %ld\n", pw->true_gap); } @@ -425,25 +427,31 @@ STRINGLIB(prepare_search)(STRINGLIB(prework) *pw, if (bc_table_and_gap) { // 2.1. Fill a compressed Boyer-Moore "Bad Character" table Py_ssize_t not_found_shift = Py_MIN(m, MAX_SHIFT); - for (Py_ssize_t i = 0; i < (Py_ssize_t)TABLE_SIZE; i++) { - pw->table[i] = Py_SAFE_DOWNCAST(not_found_shift, + for (Py_ssize_t j = 0; j < (Py_ssize_t)TABLE_SIZE; j++) { + pw->table[j] = Py_SAFE_DOWNCAST(not_found_shift, Py_ssize_t, SHIFT_TYPE); } - for (Py_ssize_t i = m - not_found_shift; i < m; i++) { - SHIFT_TYPE shift = Py_SAFE_DOWNCAST(m - 1 - i, + j = m - not_found_shift; + jp = stt + dir * j; + for (; j < m; j++) { + SHIFT_TYPE shift = Py_SAFE_DOWNCAST(m - 1 - j, Py_ssize_t, SHIFT_TYPE); - pw->table[p[stt + dir*i] & TABLE_MASK] = shift; + pw->table[p[jp] & TABLE_MASK] = shift; + jp += dir; } // 2.2. 
Initialize "Good Suffix" Last Character Gap // Note: gap("___aa") = 1 pw->gap = m; STRINGLIB_CHAR last = p[end] & TABLE_MASK; - for (Py_ssize_t i = 1; i < m; i++) { - STRINGLIB_CHAR x = p[end - dir*i] & TABLE_MASK; + j = 1; + jp = end - dir * j; + for (; j < m; j++) { + STRINGLIB_CHAR x = p[jp] & TABLE_MASK; if (x == last) { - pw->gap = i; + pw->gap = j; break; } + jp -= dir; } LOG("Good Suffix Partial Gap: %ld\n", pw->gap); } @@ -579,9 +587,9 @@ STRINGLIB(two_way_find)(const STRINGLIB_CHAR *haystack, assert((ss[ip] & TABLE_MASK) == (p[m - 1] & TABLE_MASK)); j_off = ip - p_end; j = is_periodic ? Py_MAX(cut, memory) : cut; + jp = p_stt + dir * j; for (; j < m; j++) { ihits++; - jp = p_stt + (reversed ? -j : j); LOG2("Checking j=%ld: %c vs %c\n", j, ss[j_off + jp], p[jp]); if (ss[j_off + jp] != p[jp]) { if (j < gap_jump_end) { @@ -597,15 +605,16 @@ STRINGLIB(two_way_find)(const STRINGLIB_CHAR *haystack, memory = 0; break; } + jp += dir; } if (j != m) { continue; } j = Py_MIN(memory, cut); // Needed for j == cut below to be correct + jp = p_stt + dir * j; for (; j < cut; j++) { ihits++; - jp = p_stt + (reversed ? 
-j : j); LOG2("Checking j=%ld: %c vs %c\n", j, ss[j_off + jp], p[jp]); if (ss[j_off + jp] != p[jp]) { LOG("First half does not match.\n"); @@ -616,6 +625,7 @@ STRINGLIB(two_way_find)(const STRINGLIB_CHAR *haystack, i += period; break; } + jp += dir; } if (j == cut) { LOG("Found a match!\n"); From c26dfe9da4e277d3781bcb5f7fc58e46a1831d71 Mon Sep 17 00:00:00 2001 From: "d.grigonis" Date: Thu, 27 Jun 2024 18:31:36 +0300 Subject: [PATCH 27/28] 30% faster critical factorization --- Objects/stringlib/fastsearch.h | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 87e15f05f417b3..9f0b4ca60f2222 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -237,16 +237,25 @@ STRINGLIB(_lex_search)(const STRINGLIB_CHAR *needle, // The period of the right half. Py_ssize_t period = 1; STRINGLIB_CHAR a, b; + // directional indexers for candidate & max_suffix + // This is a more efficient way to achieve: + // a = needle[stt + dir * (candidate + k)]; + // b = needle[stt + dir * (max_suffix + k)]; + Py_ssize_t cp, sp; + cp = stt + dir * candidate; + sp = stt + dir * max_suffix; while (candidate + k < needle_len) { // each loop increases (in chosen direction) candidate + k + max_suffix - a = needle[stt + dir*(candidate + k)]; - b = needle[stt + dir*(max_suffix + k)]; + a = needle[cp]; + b = needle[sp]; // check if the suffix at candidate is better than max_suffix if (invert_alphabet ? (b < a) : (a < b)) { // Fell short of max_suffix. // The next k + 1 characters are non-increasing // from candidate, so they won't start a maximal suffix. candidate += k + 1; + cp += dir; + sp -= dir * k; k = 0; // We've ruled out any period smaller than what's // been scanned since max_suffix. 
@@ -256,11 +265,15 @@ STRINGLIB(_lex_search)(const STRINGLIB_CHAR *needle, if (k + 1 != period) { // Keep scanning the equal strings k++; + cp += dir; + sp += dir; } else { // Matched a whole period. // Start matching the next period. candidate += period; + cp = stt + dir * candidate; + sp -= dir * k; k = 0; } } @@ -270,6 +283,8 @@ STRINGLIB(_lex_search)(const STRINGLIB_CHAR *needle, candidate++; k = 0; period = 1; + cp = stt + dir * candidate; + sp = stt + dir * max_suffix; } } *return_period = period; From f75f4d1824fcdb4e21bf693f8d54e1dcc035d816 Mon Sep 17 00:00:00 2001 From: "d.grigonis" Date: Thu, 27 Jun 2024 21:22:52 +0300 Subject: [PATCH 28/28] remove unrelated change --- Objects/stringlib/stringdefs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Objects/stringlib/stringdefs.h b/Objects/stringlib/stringdefs.h index 230b9978bcde2a..484b98b7291309 100644 --- a/Objects/stringlib/stringdefs.h +++ b/Objects/stringlib/stringdefs.h @@ -6,8 +6,8 @@ compiled as unicode. */ #define STRINGLIB_IS_UNICODE 0 -#define FASTSEARCH fastsearch -#define STRINGLIB(F) stringlib_##F +#define FASTSEARCH fastsearch +#define STRINGLIB(F) stringlib_##F #define STRINGLIB_OBJECT PyBytesObject #define STRINGLIB_SIZEOF_CHAR 1 #define STRINGLIB_CHAR char