diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2024-06-08-23-23-45.gh-issue-119702.PYdAU2.rst b/Misc/NEWS.d/next/Core_and_Builtins/2024-06-08-23-23-45.gh-issue-119702.PYdAU2.rst new file mode 100644 index 00000000000000..9e90d255064adf --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2024-06-08-23-23-45.gh-issue-119702.PYdAU2.rst @@ -0,0 +1 @@ +Optimization of string search algorithms. Performance optimizations extended to reverse search (``str.rfind``). diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 05e700b06258f0..70acbc518747a6 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -25,21 +25,6 @@ #define FAST_SEARCH 1 #define FAST_RSEARCH 2 -#if LONG_BIT >= 128 -#define STRINGLIB_BLOOM_WIDTH 128 -#elif LONG_BIT >= 64 -#define STRINGLIB_BLOOM_WIDTH 64 -#elif LONG_BIT >= 32 -#define STRINGLIB_BLOOM_WIDTH 32 -#else -#error "LONG_BIT is smaller than 32" -#endif - -#define STRINGLIB_BLOOM_ADD(mask, ch) \ - ((mask |= (1UL << ((ch) & (STRINGLIB_BLOOM_WIDTH -1))))) -#define STRINGLIB_BLOOM(mask, ch) \ - ((mask & (1UL << ((ch) & (STRINGLIB_BLOOM_WIDTH -1))))) - #ifdef STRINGLIB_FAST_MEMCHR # define MEMCHR_CUT_OFF 15 #else @@ -178,44 +163,99 @@ STRINGLIB(rfind_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch) #undef MEMRCHR_CUT_OFF -/* Change to a 1 to see logging comments walk through the algorithm. */ -#if 0 && STRINGLIB_SIZEOF_CHAR == 1 + +/* Change to 1 or 2 to see logging comments walk through the algorithm. + * LOG_LEVEL == 1 print excludes input strings (useful for long inputs) + * LOG_LEVEL == 2 print includes input alignments */ +# define LOG_LEVEL 0 +#if LOG_LEVEL == 1 && STRINGLIB_SIZEOF_CHAR == 1 +# define LOG(...) printf(__VA_ARGS__) +# define LOG2(...) +# define LOG_STRING(s, n) if (n < 100) { \ + printf("\"%.*s\"", (int)(n), s); \ +} +# define LOG_LINEUP() +#elif LOG_LEVEL == 2 && STRINGLIB_SIZEOF_CHAR == 1 # define LOG(...) printf(__VA_ARGS__) -# define LOG_STRING(s, n) printf("\"%.*s\"", (int)(n), s) -# define LOG_LINEUP() do { \ - LOG("> "); LOG_STRING(haystack, len_haystack); LOG("\n> "); \ - LOG("%*s",(int)(window_last - haystack + 1 - len_needle), ""); \ - LOG_STRING(needle, len_needle); LOG("\n"); \ +# define LOG2(...) printf(__VA_ARGS__) +# define LOG_STRING(s, n) if (n < 100) { \ + printf("\"%.*s\"", (int)(n), s); \ +} +# define LOG_LINEUP() do { \ + if (n < 100) { \ + LOG("> "); LOG_STRING(s, n); \ + LOG("\n> "); LOG("%*s",(int)(ss - s + ip - p_end), ""); \ + LOG_STRING(p, m); LOG("\n"); \ + } \ } while(0) #else # define LOG(...) +# define LOG2(...) # define LOG_STRING(s, n) # define LOG_LINEUP() #endif + Py_LOCAL_INLINE(Py_ssize_t) -STRINGLIB(_lex_search)(const STRINGLIB_CHAR *needle, Py_ssize_t len_needle, - Py_ssize_t *return_period, int invert_alphabet) +STRINGLIB(_lex_search)(const STRINGLIB_CHAR *needle, + Py_ssize_t needle_len, + Py_ssize_t *return_period, + int invert_alphabet, + int reversed) { /* Do a lexicographic search. Essentially this: >>> max(needle[i:] for i in range(len(needle)+1)) - Also find the period of the right half. */ + Also find the period of the right half. + Direction: + reversed : {0, 1} + if reversed == 1, then the problem is reverse + In short: + _lex_search(x, 1) == _lex_search(x[::-1], 0) + + Returned cut is the size of the cut towards chosen reversed. E.g.: + >>> x = '1234' + >>> cut, period = factorize(x, reversed=0) # cut = 0 + >>> cut + 0 + >>> cut_idx = cut + >>> x[:cut_idx], x[cut_idx:] + '', '1234' + >>> x = '4321' + >>> cut, period = factorize(x, reversed=1) + >>> cut + 0 + >>> cut_idx = len(x) - cut + >>> x[:cut_idx], x[cut_idx:] + '4321', '' + */ + int dir = reversed ? -1 : 1; + // starting position from chosen direction + Py_ssize_t stt = reversed ? needle_len - 1 : 0; Py_ssize_t max_suffix = 0; Py_ssize_t candidate = 1; Py_ssize_t k = 0; // The period of the right half. Py_ssize_t period = 1; - - while (candidate + k < len_needle) { - // each loop increases candidate + k + max_suffix - STRINGLIB_CHAR a = needle[candidate + k]; - STRINGLIB_CHAR b = needle[max_suffix + k]; + STRINGLIB_CHAR a, b; + // directional indexers for candidate & max_suffix + // It is more efficient way to achieve: + // a = needle[stt + dir * (candidate + k)]; + // b = needle[stt + dir * (max_suffix + k)]; + Py_ssize_t cp, sp; + cp = stt + dir * candidate; + sp = stt + dir * max_suffix; + while (candidate + k < needle_len) { + // each loop increases (in chosen direction) candidate + k + max_suffix + a = needle[cp]; + b = needle[sp]; // check if the suffix at candidate is better than max_suffix if (invert_alphabet ? (b < a) : (a < b)) { // Fell short of max_suffix. // The next k + 1 characters are non-increasing // from candidate, so they won't start a maximal suffix. candidate += k + 1; + cp += dir; + sp -= dir * k; k = 0; // We've ruled out any period smaller than what's // been scanned since max_suffix. @@ -225,11 +265,15 @@ STRINGLIB(_lex_search)(const STRINGLIB_CHAR *needle, Py_ssize_t len_needle, if (k + 1 != period) { // Keep scanning the equal strings k++; + cp += dir; + sp += dir; } else { // Matched a whole period. // Start matching the next period. candidate += period; + cp = stt + dir * candidate; + sp -= dir * k; k = 0; } } @@ -239,16 +283,20 @@ STRINGLIB(_lex_search)(const STRINGLIB_CHAR *needle, Py_ssize_t len_needle, candidate++; k = 0; period = 1; + cp = stt + dir * candidate; + sp = stt + dir * max_suffix; } } *return_period = period; return max_suffix; } + Py_LOCAL_INLINE(Py_ssize_t) STRINGLIB(_factorize)(const STRINGLIB_CHAR *needle, - Py_ssize_t len_needle, - Py_ssize_t *return_period) + Py_ssize_t needle_len, + Py_ssize_t *return_period, + int reversed) { /* Do a "critical factorization", making it so that: >>> needle = (left := needle[:cut]) + (right := needle[cut:]) @@ -280,12 +328,18 @@ STRINGLIB(_factorize)(const STRINGLIB_CHAR *needle, GC | AGAGAG AGAGAGC = AGAGAGC The length of this minimal repetition is 7, which is indeed the - period of the original string. */ - + period of the original string. + + Direction: + reversed : {0, 1} + if reversed == 1, then the problem is reverse + In short: + _factorize(x, 1) == _factorize(x[::-1], 0) + See docstring of _lex_search if still unclear + */ Py_ssize_t cut1, period1, cut2, period2, cut, period; - cut1 = STRINGLIB(_lex_search)(needle, len_needle, &period1, 0); - cut2 = STRINGLIB(_lex_search)(needle, len_needle, &period2, 1); - + cut1 = STRINGLIB(_lex_search)(needle, needle_len, &period1, 0, reversed); + cut2 = STRINGLIB(_lex_search)(needle, needle_len, &period2, 1, reversed); // Take the later cut. if (cut1 > cut2) { period = period1; @@ -295,446 +349,579 @@ STRINGLIB(_factorize)(const STRINGLIB_CHAR *needle, period = period2; cut = cut2; } - - LOG("split: "); LOG_STRING(needle, cut); - LOG(" + "); LOG_STRING(needle + cut, len_needle - cut); - LOG("\n"); - *return_period = period; return cut; } +// Bloom Setup +#if LONG_BIT >= 128 +# define STRINGLIB_BLOOM_MASK 127 +#elif LONG_BIT >= 64 +# define STRINGLIB_BLOOM_MASK 63 +#elif LONG_BIT >= 32 +# define STRINGLIB_BLOOM_MASK 31 +#else +# error "LONG_BIT is smaller than 32" +#endif + +#define STRINGLIB_BLOOM_ADD(mask, ch) \ + ((mask |= (1UL << ((ch) & (STRINGLIB_BLOOM_MASK))))) +#define STRINGLIB_BLOOM(mask, ch) \ + ((mask & (1UL << ((ch) & (STRINGLIB_BLOOM_MASK))))) + +// Boyer-Moore "Bad Character" table Setup #define SHIFT_TYPE uint8_t #define MAX_SHIFT UINT8_MAX -#define TABLE_SIZE_BITS 6u -#define TABLE_SIZE (1U << TABLE_SIZE_BITS) -#define TABLE_MASK (TABLE_SIZE - 1U) +#define TABLE_SIZE 128U +#define TABLE_MASK 127U + typedef struct STRINGLIB(_pre) { - const STRINGLIB_CHAR *needle; - Py_ssize_t len_needle; - Py_ssize_t cut; - Py_ssize_t period; - Py_ssize_t gap; + // Data & Direction + const STRINGLIB_CHAR *needle; // needle + Py_ssize_t needle_len; // length of the needle + int reversed; // Reverse Direction + // Containment mask and Full "Good Suffix" gap for last character + unsigned long mask; // Containment bloom type mask + Py_ssize_t true_gap; // Actual Last character gap + // "Bad Character" table and Filtered "Good Suffix" gap for last character + SHIFT_TYPE table[TABLE_SIZE]; // Boyer-Moore "Bad Character" table + Py_ssize_t gap; // Filterd Last Character Gap <= true_gap + // Critical Factorization + Py_ssize_t cut; // Critical Factorization Cut Length + Py_ssize_t period; // Global Period of the string int is_periodic; - SHIFT_TYPE table[TABLE_SIZE]; } STRINGLIB(prework); static void -STRINGLIB(_preprocess)(const STRINGLIB_CHAR *needle, Py_ssize_t len_needle, - STRINGLIB(prework) *p) +STRINGLIB(prepare_search)(STRINGLIB(prework) *pw, + int bloom_mask_and_true_gap, + int bc_table_and_gap, + int critical_factorization) { - p->needle = needle; - p->len_needle = len_needle; - p->cut = STRINGLIB(_factorize)(needle, len_needle, &(p->period)); - assert(p->period + p->cut <= len_needle); - p->is_periodic = (0 == memcmp(needle, - needle + p->period, - p->cut * STRINGLIB_SIZEOF_CHAR)); - if (p->is_periodic) { - assert(p->cut <= len_needle/2); - assert(p->cut < p->period); - } - else { - // A lower bound on the period - p->period = Py_MAX(p->cut, len_needle - p->cut) + 1; - } - // The gap between the last character and the previous - // occurrence of an equivalent character (modulo TABLE_SIZE) - p->gap = len_needle; - STRINGLIB_CHAR last = needle[len_needle - 1] & TABLE_MASK; - for (Py_ssize_t i = len_needle - 2; i >= 0; i--) { - STRINGLIB_CHAR x = needle[i] & TABLE_MASK; - if (x == last) { - p->gap = len_needle - 1 - i; - break; + /* + This function prepares different search algorithm methods + + horspool_find: bc_table_and_gap = 1 + bloom_mask_and_true_gap = 1 (optional) + + two_way_find: bc_table_and_gap = 1 + critical_factorization = 1 + */ + const STRINGLIB_CHAR *p = pw->needle; + Py_ssize_t m = pw->needle_len; + int reversed = pw->reversed; + int dir = reversed ? -1 : 1; + Py_ssize_t stt = reversed ? m - 1 : 0; + Py_ssize_t end = reversed ? 0 : m - 1; + Py_ssize_t j, jp; + // 1. Containment mask and Full "Good Suffix" gap for last character + if (bloom_mask_and_true_gap) { + const STRINGLIB_CHAR p_last = p[end]; + pw->mask = 0; + pw->true_gap = m; + // Note: true_gap("___aa") = 1 + STRINGLIB_CHAR p_tmp; + STRINGLIB_BLOOM_ADD(pw->mask, p_last); + j = 1; + jp = end - dir * j; + for (; j < m; j++) { + p_tmp = p[jp]; + STRINGLIB_BLOOM_ADD(pw->mask, p_tmp); + if (pw->true_gap == m && p_tmp == p_last) { + pw->true_gap = j; + } + jp -= dir; } + LOG("Good Suffix Full Gap: %ld\n", pw->true_gap); } - // Fill up a compressed Boyer-Moore "Bad Character" table - Py_ssize_t not_found_shift = Py_MIN(len_needle, MAX_SHIFT); - for (Py_ssize_t i = 0; i < (Py_ssize_t)TABLE_SIZE; i++) { - p->table[i] = Py_SAFE_DOWNCAST(not_found_shift, - Py_ssize_t, SHIFT_TYPE); - } - for (Py_ssize_t i = len_needle - not_found_shift; i < len_needle; i++) { - SHIFT_TYPE shift = Py_SAFE_DOWNCAST(len_needle - 1 - i, - Py_ssize_t, SHIFT_TYPE); - p->table[needle[i] & TABLE_MASK] = shift; - } -} -static Py_ssize_t -STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, - STRINGLIB(prework) *p) -{ - // Crochemore and Perrin's (1991) Two-Way algorithm. - // See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260 - const Py_ssize_t len_needle = p->len_needle; - const Py_ssize_t cut = p->cut; - Py_ssize_t period = p->period; - const STRINGLIB_CHAR *const needle = p->needle; - const STRINGLIB_CHAR *window_last = haystack + len_needle - 1; - const STRINGLIB_CHAR *const haystack_end = haystack + len_haystack; - SHIFT_TYPE *table = p->table; - const STRINGLIB_CHAR *window; - LOG("===== Two-way: \"%s\" in \"%s\". =====\n", needle, haystack); - - Py_ssize_t gap = p->gap; - Py_ssize_t gap_jump_end = Py_MIN(len_needle, cut + gap); - if (p->is_periodic) { - LOG("Needle is periodic.\n"); - Py_ssize_t memory = 0; - periodicwindowloop: - while (window_last < haystack_end) { - assert(memory == 0); - for (;;) { - LOG_LINEUP(); - Py_ssize_t shift = table[(*window_last) & TABLE_MASK]; - window_last += shift; - if (shift == 0) { - break; - } - if (window_last >= haystack_end) { - return -1; - } - LOG("Horspool skip\n"); - } - no_shift: - window = window_last - len_needle + 1; - assert((window[len_needle - 1] & TABLE_MASK) == - (needle[len_needle - 1] & TABLE_MASK)); - Py_ssize_t i = Py_MAX(cut, memory); - for (; i < len_needle; i++) { - if (needle[i] != window[i]) { - if (i < gap_jump_end) { - LOG("Early right half mismatch: jump by gap.\n"); - assert(gap >= i - cut + 1); - window_last += gap; - } - else { - LOG("Late right half mismatch: jump by n (>gap)\n"); - assert(i - cut + 1 > gap); - window_last += i - cut + 1; - } - memory = 0; - goto periodicwindowloop; - } - } - for (i = memory; i < cut; i++) { - if (needle[i] != window[i]) { - LOG("Left half does not match.\n"); - window_last += period; - memory = len_needle - period; - if (window_last >= haystack_end) { - return -1; - } - Py_ssize_t shift = table[(*window_last) & TABLE_MASK]; - if (shift) { - // A mismatch has been identified to the right - // of where i will next start, so we can jump - // at least as far as if the mismatch occurred - // on the first comparison. - Py_ssize_t mem_jump = Py_MAX(cut, memory) - cut + 1; - LOG("Skip with Memory.\n"); - memory = 0; - window_last += Py_MAX(shift, mem_jump); - goto periodicwindowloop; - } - goto no_shift; - } - } - LOG("Found a match!\n"); - return window - haystack; + if (bc_table_and_gap) { + // 2.1. Fill a compressed Boyer-Moore "Bad Character" table + Py_ssize_t not_found_shift = Py_MIN(m, MAX_SHIFT); + for (Py_ssize_t j = 0; j < (Py_ssize_t)TABLE_SIZE; j++) { + pw->table[j] = Py_SAFE_DOWNCAST(not_found_shift, + Py_ssize_t, SHIFT_TYPE); } - } - else { - period = Py_MAX(gap, period); - LOG("Needle is not periodic.\n"); - windowloop: - while (window_last < haystack_end) { - for (;;) { - LOG_LINEUP(); - Py_ssize_t shift = table[(*window_last) & TABLE_MASK]; - window_last += shift; - if (shift == 0) { - break; - } - if (window_last >= haystack_end) { - return -1; - } - LOG("Horspool skip\n"); - } - window = window_last - len_needle + 1; - assert((window[len_needle - 1] & TABLE_MASK) == - (needle[len_needle - 1] & TABLE_MASK)); - Py_ssize_t i = cut; - for (; i < len_needle; i++) { - if (needle[i] != window[i]) { - if (i < gap_jump_end) { - LOG("Early right half mismatch: jump by gap.\n"); - assert(gap >= i - cut + 1); - window_last += gap; - } - else { - LOG("Late right half mismatch: jump by n (>gap)\n"); - assert(i - cut + 1 > gap); - window_last += i - cut + 1; - } - goto windowloop; - } - } - for (Py_ssize_t i = 0; i < cut; i++) { - if (needle[i] != window[i]) { - LOG("Left half does not match.\n"); - window_last += period; - goto windowloop; - } + j = m - not_found_shift; + jp = stt + dir * j; + for (; j < m; j++) { + SHIFT_TYPE shift = Py_SAFE_DOWNCAST(m - 1 - j, + Py_ssize_t, SHIFT_TYPE); + pw->table[p[jp] & TABLE_MASK] = shift; + jp += dir; + } + // 2.2. Initialize "Good Suffix" Last Character Gap + // Note: gap("___aa") = 1 + pw->gap = m; + STRINGLIB_CHAR last = p[end] & TABLE_MASK; + j = 1; + jp = end - dir * j; + for (; j < m; j++) { + STRINGLIB_CHAR x = p[jp] & TABLE_MASK; + if (x == last) { + pw->gap = j; + break; } - LOG("Found a match!\n"); - return window - haystack; + jp -= dir; } + LOG("Good Suffix Partial Gap: %ld\n", pw->gap); } - LOG("Not found. Returning -1.\n"); - return -1; -} - -static Py_ssize_t -STRINGLIB(_two_way_find)(const STRINGLIB_CHAR *haystack, - Py_ssize_t len_haystack, - const STRINGLIB_CHAR *needle, - Py_ssize_t len_needle) -{ - LOG("###### Finding \"%s\" in \"%s\".\n", needle, haystack); - STRINGLIB(prework) p; - STRINGLIB(_preprocess)(needle, len_needle, &p); - return STRINGLIB(_two_way)(haystack, len_haystack, &p); -} - - -static Py_ssize_t -STRINGLIB(_two_way_count)(const STRINGLIB_CHAR *haystack, - Py_ssize_t len_haystack, - const STRINGLIB_CHAR *needle, - Py_ssize_t len_needle, - Py_ssize_t maxcount) -{ - LOG("###### Counting \"%s\" in \"%s\".\n", needle, haystack); - STRINGLIB(prework) p; - STRINGLIB(_preprocess)(needle, len_needle, &p); - Py_ssize_t index = 0, count = 0; - while (1) { - Py_ssize_t result; - result = STRINGLIB(_two_way)(haystack + index, - len_haystack - index, &p); - if (result == -1) { - return count; + // 3. Calculate Critical Factorization + if (critical_factorization) { + Py_ssize_t period; + Py_ssize_t cut = STRINGLIB(_factorize)(p, m, &period, reversed); + assert(cut + period <= m); + int cmp; + if (reversed) { + Py_ssize_t cut_idx = m - cut; + cmp = memcmp(p + cut_idx, p + cut_idx - period, + cut * STRINGLIB_SIZEOF_CHAR); } - count++; - if (count == maxcount) { - return maxcount; + else { + cmp = memcmp(p, p + period, cut * STRINGLIB_SIZEOF_CHAR); } - index += result + len_needle; + int is_periodic = cmp == 0; + if (is_periodic) { + assert(cut <= m/2); + assert(cut < period); + LOG("Needle is periodic.\n"); + } + else { + // Upper bound on the period + period = Py_MAX(cut, m - cut) + 1; + LOG("Needle is not periodic.\n"); + } + pw->cut = cut; + pw->period = period; + pw->is_periodic = is_periodic; + + LOG("Cut: %ld & Period: %ld\n", cut, period); + LOG("split: "); + LOG_STRING(p, cut); + LOG(" + "); + LOG_STRING(p + cut, m - cut); + LOG("\n"); } - return count; } -#undef SHIFT_TYPE -#undef NOT_FOUND -#undef SHIFT_OVERFLOW -#undef TABLE_SIZE_BITS -#undef TABLE_SIZE -#undef TABLE_MASK -#undef LOG -#undef LOG_STRING -#undef LOG_LINEUP - -static inline Py_ssize_t -STRINGLIB(default_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, - const STRINGLIB_CHAR* p, Py_ssize_t m, - Py_ssize_t maxcount, int mode) +static Py_ssize_t +STRINGLIB(two_way_find)(const STRINGLIB_CHAR *haystack, + const Py_ssize_t haystack_len, + int find_mode, Py_ssize_t maxcount, + STRINGLIB(prework) *pw) { + /* Crochemore and Perrin's (1991) Two-Way algorithm. + See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260 + + Initialization + pw needs to be initialized using prepare_search with + bc_table_and_gap = 1 and critical_factorization = 1 + + Variable Names + s - haystack + p - needle + n - len(haystack) + m - len(needle) + + Bi-Directional Conventions: + See docstring of horspool_find + + Critical factorization reversion: + See docstring of _factorize + */ + LOG(find_mode ? "Two-way Find.\n" : "Two-way Count.\n"); + + // Collect Data + int reversed = pw->reversed; + int dir = reversed ? -1 : 1; + const STRINGLIB_CHAR *const p = pw->needle; + const STRINGLIB_CHAR *const s = haystack; + const Py_ssize_t m = pw->needle_len; + const Py_ssize_t n = haystack_len; + LOG("haystack: "); LOG_STRING(s, n); LOG("\n"); + LOG("needle : "); LOG_STRING(p, m); LOG("\n"); + + // Retrieve Preparation + SHIFT_TYPE *table = pw->table; + const Py_ssize_t gap = pw->gap; + const Py_ssize_t cut = pw->cut; + int is_periodic = pw->is_periodic; + Py_ssize_t period = is_periodic ? pw->period : Py_MAX(gap, pw->period); + Py_ssize_t gap_jump_end = Py_MIN(m, cut + gap); + + // Direction Independent const Py_ssize_t w = n - m; - Py_ssize_t mlast = m - 1, count = 0; - Py_ssize_t gap = mlast; - const STRINGLIB_CHAR last = p[mlast]; - const STRINGLIB_CHAR *const ss = &s[mlast]; - - unsigned long mask = 0; - for (Py_ssize_t i = 0; i < mlast; i++) { - STRINGLIB_BLOOM_ADD(mask, p[i]); - if (p[i] == last) { - gap = mlast - i - 1; - } - } - STRINGLIB_BLOOM_ADD(mask, last); - - for (Py_ssize_t i = 0; i <= w; i++) { - if (ss[i] == last) { - /* candidate match */ - Py_ssize_t j; - for (j = 0; j < mlast; j++) { - if (s[i+j] != p[j]) { - break; - } + const Py_ssize_t m_m1 = m - 1; + // Direction Dependent + const Py_ssize_t p_stt = dir == 1 ? 0 : m - 1; + const Py_ssize_t s_stt = dir == 1 ? 0 : n - 1; + const Py_ssize_t p_end = dir == 1 ? m - 1 : 0; + const Py_ssize_t dir_m_m1 = reversed ? -m_m1 : m_m1; + const STRINGLIB_CHAR *const ss = s + s_stt + dir_m_m1; + + // Indexers (ip and jp are directional indices. e.g. ip = pos * i) + Py_ssize_t i, j, ip, jp; + // Temporary + Py_ssize_t j_off; // offset from last to leftmost window index + Py_ssize_t shift; + int do_mem_jump = 0; + // Counters + Py_ssize_t iloop = 0; + Py_ssize_t ihits = 0; + Py_ssize_t count = 0; + Py_ssize_t memory = 0; + // Loop + for (i = 0; i <= w;) { + iloop++; + ip = reversed ? -i : i; + LOG2("Last window ch: %c\n", ss[ip]); + LOG_LINEUP(); + shift = table[ss[ip] & TABLE_MASK]; + if (shift){ + if (do_mem_jump) { + // A mismatch has been identified to the right + // of where i will next start, so we can jump + // at least as far as if the mismatch occurred + // on the first comparison. + shift = Py_MAX(shift, Py_MAX(1, memory - cut + 1)); + memory = 0; + LOG("Skip with Memory: %ld\n", shift); } - if (j == mlast) { - /* got a match! */ - if (mode != FAST_COUNT) { - return i; + assert(memory == 0); + LOG("Shift: %ld\n", shift); + i += shift; + do_mem_jump = 0; + continue; + } + assert((ss[ip] & TABLE_MASK) == (p[m - 1] & TABLE_MASK)); + j_off = ip - p_end; + j = is_periodic ? Py_MAX(cut, memory) : cut; + jp = p_stt + dir * j; + for (; j < m; j++) { + ihits++; + LOG2("Checking j=%ld: %c vs %c\n", j, ss[j_off + jp], p[jp]); + if (ss[j_off + jp] != p[jp]) { + if (j < gap_jump_end) { + LOG("Early later half mismatch: jump by gap.\n"); + assert(gap >= j - cut + 1); + i += gap; } - count++; - if (count == maxcount) { - return maxcount; + else { + LOG("Late later half mismatch: jump by n (>gap)\n"); + assert(j - cut + 1 > gap); + i += j - cut + 1; } - i = i + mlast; - continue; - } - /* miss: check if next character is part of pattern */ - if (!STRINGLIB_BLOOM(mask, ss[i+1])) { - i = i + m; + memory = 0; + break; } - else { - i = i + gap; + jp += dir; + } + if (j != m) { + continue; + } + + j = Py_MIN(memory, cut); // Needed for j == cut below to be correct + jp = p_stt + dir * j; + for (; j < cut; j++) { + ihits++; + LOG2("Checking j=%ld: %c vs %c\n", j, ss[j_off + jp], p[jp]); + if (ss[j_off + jp] != p[jp]) { + LOG("First half does not match.\n"); + if (is_periodic) { + memory = m - period; + do_mem_jump = 1; + } + i += period; + break; } + jp += dir; } - else { - /* skip: check if next character is part of pattern */ - if (!STRINGLIB_BLOOM(mask, ss[i+1])) { - i = i + m; + if (j == cut) { + LOG("Found a match!\n"); + if (find_mode) { + return reversed ? w - i : i; + } + if (++count == maxcount) { + return maxcount; } + memory = 0; + i += m; } + } - return mode == FAST_COUNT ? count : -1; + // Loop Counter and Memory Access Counter Logging (Used for calibration) + // In worst case scenario iloop == n - m + // iloop == ihits indicates linear performance for quadratic problems + LOG("iloop: %ld\n", iloop); + LOG("ihits: %ld\n", ihits); + return find_mode ? -1 : count; } static Py_ssize_t -STRINGLIB(adaptive_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, - const STRINGLIB_CHAR* p, Py_ssize_t m, - Py_ssize_t maxcount, int mode) +STRINGLIB(horspool_find)(const STRINGLIB_CHAR* haystack, + const Py_ssize_t haystack_len, + int find_mode, Py_ssize_t maxcount, + int dynamic, STRINGLIB(prework) *pw) { + /* Boyer–Moore–Horspool algorithm + with optional dynamic fallback to Two-Way algorithm + + Initialization + pw needs to be initialized using prepare_search with + bc_table_and_gap = 1 and optionally bloom_mask_and_true_gap = 1 + + Variable Names + s - haystack + p - needle + n - len(haystack) + m - len(needle) + + Bi-Directional Conventions: + stt - start index + end - end index + ss - pointer to last window index that matches last needle character + >>> dir_fwd, dir_rev = 1, -1 + >>> s = [0, 1, 2, 3, 4, 5] + >>> s_stt_fwd, s_stt_rev = 0, 5 + >>> s_end_fwd, s_end_rev = 5, 0 + >>> p = [0, 1] + >>> m = len(p) + >>> s = 0 + >>> ss_fwd = s + s_stt_fwd + dir_fwd * (m - 1) + >>> ss_rev = s + s_stt_rev + dir_rev * (m - 1) + >>> ss_fwd, ss_rev + (1, 4) + + There is one more important variable: j_off + It brings ss in alignment with a needle. + So that it stands at the first absolute index of the window + + >>> i = 0 # first step + >>> p_stt_fwd, p_stt_rev = 0, 1 + >>> p_end_fwd, p_end_rev = 1, 0 + >>> j_off_fwd = dir_fwd * i - p_end_fwd + >>> ss_fwd + j_off_fwd + 0 + + such that [0, 1, 2, 3, 4, 5] + [0, 1] + * - both indices are at 0 + + >>> j_off_rev = dir_rev * i - p_end_rev + >>> ss_rev + j_off_rev + 4 + + such that [0, 1, 2, 3, 4, 5] + [0, 1] + * - both indices are at 0 + Finally, which side it iterates from is determined by: + jp = p_stt + (reversed ? -j : j); + , where j is an increasing needle-size counter in both cases + + With this transformation the problem becomes direction agnostic + + Dynamic mode + 'Horspool' algorithm will switch to `two_way_find` if it predicts + that it can solve the problem faster. + + Calibration + The simple model for run time of search algorithm is as follows: + loop - actual loop that happens (not theoretical) + init_cost - initialization cost per 1 needle character in ns + loop_cost - cost of 1 main loop + hit_cost - cost of 1 false positive character check + avg_hit - average number of false positive hits per 1 loop + + m = len(needle) + run_time = init_cost * m + n_loops * (loop_cost + hit_cost * avg_hit) + + Note, n_loops * avg_hit is what causes quadratic complexity. + + Calibrate: + 1. expose function to run without handling special cases first. + 2. set dynamic = 0 + 3. Enable counter printing to know how many hits and loops + happened. see printf(iloop|ihits) at the end of the function + + 4. init_cost = run_time(horspool_find(s='', p='*' * m)) / m + + 5. `two_way` only has loop cost. + run_time(two_way_find(s='*' * 1000)) - init_cost + loop_cost = ------------------------------------------------ + n_loops (from stdout) + Note, iloop & ihits of `two_way` should be the same. + + 6. To get loop_cost and hit_cost of `horspool_find` solve + equation system representing 2 different runs + n_loops1 * loop_cost + n_hits1 * hit_cost = run_time(problem_1) + n_loops2 * loop_cost + n_hits2 * hit_cost = run_time(problem_2) + + init_cost of `horspool` for larger problems is negligible + Furthermore, it is not used as it has already happened + + 7. Run above for different problems. Take averages. + Compare with current calibration constants. + + 8. Current calibration works well, but is not perfect. + Maybe you can come up with more accurate model. + */ + LOG(find_mode ? "Horspool Find\n" : "Horspool Count.\n"); + + // Collect Data + int reversed = pw->reversed; + int dir = reversed ? -1 : 1; + const STRINGLIB_CHAR *const p = pw->needle; + const STRINGLIB_CHAR *const s = haystack; + const Py_ssize_t m = pw->needle_len; + const Py_ssize_t n = haystack_len; + LOG("haystack: "); LOG_STRING(s, n); LOG("\n"); + LOG("needle : "); LOG_STRING(p, m); LOG("\n"); + + // Direction Independent + const Py_ssize_t m_m1 = m - 1; + const Py_ssize_t m_p1 = m + 1; const Py_ssize_t w = n - m; - Py_ssize_t mlast = m - 1, count = 0; - Py_ssize_t gap = mlast; - Py_ssize_t hits = 0, res; - const STRINGLIB_CHAR last = p[mlast]; - const STRINGLIB_CHAR *const ss = &s[mlast]; - - unsigned long mask = 0; - for (Py_ssize_t i = 0; i < mlast; i++) { - STRINGLIB_BLOOM_ADD(mask, p[i]); - if (p[i] == last) { - gap = mlast - i - 1; - } - } - STRINGLIB_BLOOM_ADD(mask, last); - - for (Py_ssize_t i = 0; i <= w; i++) { - if (ss[i] == last) { - /* candidate match */ - Py_ssize_t j; - for (j = 0; j < mlast; j++) { - if (s[i+j] != p[j]) { - break; - } + + // Retrieve Preparation + int bloom_on = pw->true_gap != 0; + unsigned long mask = pw->mask; + // If bloom_on last character always matches so no need to check it + Py_ssize_t j_stop = bloom_on ? m_m1 : m; + Py_ssize_t gap = bloom_on ? pw->true_gap : pw->gap; + SHIFT_TYPE *table = pw->table; + + // Direction Dependent + const Py_ssize_t s_stt = reversed ? n - 1 : 0; + const Py_ssize_t p_stt = reversed ? m - 1 : 0; + const Py_ssize_t p_end = reversed ? 0 : m - 1; + const Py_ssize_t dir_m_m1 = reversed ? -m_m1 : m_m1; + const STRINGLIB_CHAR *const ss = s + s_stt + dir_m_m1; + const STRINGLIB_CHAR p_last = p[p_end]; + + // Indexers (ip and jp are directional indices. e.g. ip = pos * i) + Py_ssize_t i, j, ip, jp; + + // Horspool Calibration + const double hrs_lcost = 4.0; // average loop cost + const double hrs_hcost = 0.4; // false positive hit cost + // Two-Way Calibration + const double twy_icost = 3.5 * (double)m; // total initialization cost + const double twy_lcost = 3.0; // loop cost + // Temporary + double exp_hrs, exp_twy, ll; // expected run times & loops left + Py_ssize_t j_off; // offset from last to leftmost window index + Py_ssize_t shift; + STRINGLIB_CHAR s_last; + // Counters + Py_ssize_t count = 0; + Py_ssize_t iloop = 0; + Py_ssize_t ihits = 0, ihits_last = 0; + // Loop + for (i = 0; i <= w;) { + iloop++; + ip = reversed ? -i : i; + s_last = ss[ip]; + LOG2("Last window ch: %c\n", s_last); + if (bloom_on) { + shift = 0; + if (s_last != p_last) { + shift = i < w && !STRINGLIB_BLOOM(mask, ss[ip+dir]) ? + m_p1 : Py_MAX(table[s_last & TABLE_MASK], 1); } - if (j == mlast) { - /* got a match! */ - if (mode != FAST_COUNT) { - return i; - } - count++; - if (count == maxcount) { - return maxcount; - } - i = i + mlast; + if (shift) { + LOG("Shift: %ld\n", shift); + i += shift; continue; } - hits += j + 1; - if (hits > m / 4 && w - i > 2000) { - if (mode == FAST_SEARCH) { - res = STRINGLIB(_two_way_find)(s + i, n - i, p, m); - return res == -1 ? -1 : res + i; - } - else { - res = STRINGLIB(_two_way_count)(s + i, n - i, p, m, - maxcount - count); - return res + count; - } - } - /* miss: check if next character is part of pattern */ - if (!STRINGLIB_BLOOM(mask, ss[i+1])) { - i = i + m; - } - else { - i = i + gap; - } + assert(s_last == p_last); } else { - /* skip: check if next character is part of pattern */ - if (!STRINGLIB_BLOOM(mask, ss[i+1])) { - i = i + m; + shift = table[s_last & TABLE_MASK]; + if (shift) { + LOG("Shift: %ld\n", shift); + i += shift; + continue; } + assert((s_last & TABLE_MASK) == (p_last & TABLE_MASK)); } - } - return mode == FAST_COUNT ? count : -1; -} - - -static Py_ssize_t -STRINGLIB(default_rfind)(const STRINGLIB_CHAR* s, Py_ssize_t n, - const STRINGLIB_CHAR* p, Py_ssize_t m, - Py_ssize_t maxcount, int mode) -{ - /* create compressed boyer-moore delta 1 table */ - unsigned long mask = 0; - Py_ssize_t i, j, mlast = m - 1, skip = m - 1, w = n - m; - - /* process pattern[0] outside the loop */ - STRINGLIB_BLOOM_ADD(mask, p[0]); - /* process pattern[:0:-1] */ - for (i = mlast; i > 0; i--) { - STRINGLIB_BLOOM_ADD(mask, p[i]); - if (p[i] == p[0]) { - skip = i - 1; + j_off = ip - p_end; + jp = p_stt; + for (j = 0; j < j_stop; j++) { + ihits++; + LOG2("Checking j=%ld: %c vs %c\n", j, ss[j_off + jp], p[jp]); + if (ss[j_off + jp] != p[jp]) { + break; + } + jp += dir; } - } - for (i = w; i >= 0; i--) { - if (s[i] == p[0]) { - /* candidate match */ - for (j = mlast; j > 0; j--) { - if (s[i+j] != p[j]) { - break; - } - } - if (j == 0) { - /* got a match! */ - return i; - } - /* miss: check if previous character is part of pattern */ - if (i > 0 && !STRINGLIB_BLOOM(mask, s[i-1])) { - i = i - m; + if (j == j_stop) { + LOG("Found a match!\n"); + if (find_mode) { + return reversed ? w - i : i; } - else { - i = i - skip; + if (++count == maxcount) { + return maxcount; } + i += m; + } + else if (bloom_on && i < w && !STRINGLIB_BLOOM(mask, ss[ip+dir])) { + // full skip: check if next character is part of pattern + LOG("Move by (m + 1) = %ld\n", m_p1); + i += m_p1; } else { - /* skip: check if previous character is part of pattern */ - if (i > 0 && !STRINGLIB_BLOOM(mask, s[i-1])) { - i = i - m; + LOG("Move by gap = %ld\n", gap); + i += gap; + } + + if (dynamic && ihits - ihits_last >= 100) { + ll = (double)(w - i + 1); + exp_hrs = ((double)iloop * hrs_lcost + + (double)ihits * hrs_hcost) / (double)i * ll; + exp_twy = twy_icost + ll * twy_lcost; + if (exp_twy < exp_hrs) { + LOG("switching to two-way algorithm: n=%ld, m=%ld\n", n, m); + STRINGLIB(prepare_search)(pw, 0, 0, 1); + Py_ssize_t res = STRINGLIB(two_way_find)( + reversed ? s : s + i, n - i, + find_mode, maxcount - count, pw); + if (find_mode) { + return res == -1 ? -1 : (reversed ? res : res + i); + } + else { + return res + count; + } } + ihits_last = ihits; } } - return -1; + // Loop Counter and False Hit Counter Logging + // In worst case scenario: ihits == (n - m) * m + // Used for calibration and fallback decision + LOG("iloop: %ld\n", iloop); + LOG("ihits: %ld\n", ihits); + return find_mode ? -1 : count; } +#undef STRINGLIB_BLOOM_MASK + +#undef SHIFT_TYPE +#undef TABLE_SIZE +#undef TABLE_MASK + +#undef LOG +#undef LOG_STRING +#undef LOG_LINEUP +#undef LOG_LEVEL + static inline Py_ssize_t STRINGLIB(count_char)(const STRINGLIB_CHAR *s, Py_ssize_t n, @@ -777,7 +964,6 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, if (n < m || (mode == FAST_COUNT && maxcount == 0)) { return -1; } - /* look for special cases */ if (m <= 1) { if (m <= 0) { @@ -795,36 +981,26 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, return STRINGLIB(count_char)(s, n, p[0], maxcount); } } - - if (mode != FAST_RSEARCH) { - if (n < 2500 || (m < 100 && n < 30000) || m < 6) { - return STRINGLIB(default_find)(s, n, p, m, maxcount, mode); - } - else if ((m >> 2) * 3 < (n >> 2)) { - /* 33% threshold, but don't overflow. */ - /* For larger problems where the needle isn't a huge - percentage of the size of the haystack, the relatively - expensive O(m) startup cost of the two-way algorithm - will surely pay off. */ - if (mode == FAST_SEARCH) { - return STRINGLIB(_two_way_find)(s, n, p, m); - } - else { - return STRINGLIB(_two_way_count)(s, n, p, m, maxcount); - } - } - else { - /* To ensure that we have good worst-case behavior, - here's an adaptive version of the algorithm, where if - we match O(m) characters without any matches of the - entire needle, then we predict that the startup cost of - the two-way algorithm will probably be worth it. */ - return STRINGLIB(adaptive_find)(s, n, p, m, maxcount, mode); - } - } - else { - /* FAST_RSEARCH */ - return STRINGLIB(default_rfind)(s, n, p, m, maxcount, mode); + else if (n == m) { + /* use special case when both strings are of equal length */ + int res = memcmp(s, p, m * sizeof(STRINGLIB_CHAR)); + if (mode == FAST_COUNT){ + return res == 0 ? 1 : 0; + } else { + return res == 0 ? 0 : -1; + } } -} + STRINGLIB(prework) pw = { + .needle = p, + .needle_len = m, + .reversed = mode == FAST_RSEARCH, + .true_gap = 0, + }; + // Use Bloom for len(haystack) >= 30 * len(needle) + int bloom_mask_and_true_gap = n >= 30 * m; + STRINGLIB(prepare_search)(&pw, bloom_mask_and_true_gap, 1, 0); + int find_mode = mode != FAST_COUNT; + int dynamic = 1; + return STRINGLIB(horspool_find)(s, n, find_mode, maxcount, dynamic, &pw); +}