Refactor \X regex handling to avoid a typical case table lookup

Prior to this commit 98.4% of Unicode code points that went through \X had to be looked up to see if they begin a grapheme cluster; then looked up again to find that they didn't require special handling. This commit refactors things so only one look-up is required for those 98.4%. It changes the table generated by mktables to accomplish this, and hence the name of it, and references to it are changed to correspond.
schacon · Aug 28, 2012 · 27d4fc3 · 27d4fc3
1 parent 9914684
commit 27d4fc3
Show file tree

Hide file tree

Showing 9 changed files with 41 additions and 36 deletions.
diff --git a/embed.fnc b/embed.fnc
@@ -660,9 +660,9 @@ ApR	|bool	|is_utf8_print	|NN const U8 *p
 ApR	|bool	|is_utf8_punct	|NN const U8 *p
 ApR	|bool	|is_utf8_xdigit	|NN const U8 *p
 ApR	|bool	|is_utf8_mark	|NN const U8 *p
-EXpR	|bool	|is_utf8_X_begin	|NN const U8 *p
 EXpR	|bool	|is_utf8_X_extend	|NN const U8 *p
 EXpR	|bool	|is_utf8_X_prepend	|NN const U8 *p
+EXpR	|bool	|is_utf8_X_regular_begin|NN const U8 *p
 EXpR	|bool	|is_utf8_X_special_begin|NN const U8 *p
 EXpR	|bool	|is_utf8_X_L		|NN const U8 *p
 EXpR	|bool	|is_utf8_X_RI		|NN const U8 *p

diff --git a/embed.h b/embed.h
@@ -861,9 +861,9 @@
 #define is_utf8_X_RI(a)		Perl_is_utf8_X_RI(aTHX_ a)
 #define is_utf8_X_T(a)		Perl_is_utf8_X_T(aTHX_ a)
 #define is_utf8_X_V(a)		Perl_is_utf8_X_V(aTHX_ a)
-#define is_utf8_X_begin(a)	Perl_is_utf8_X_begin(aTHX_ a)
 #define is_utf8_X_extend(a)	Perl_is_utf8_X_extend(aTHX_ a)
 #define is_utf8_X_prepend(a)	Perl_is_utf8_X_prepend(aTHX_ a)
+#define is_utf8_X_regular_begin(a)	Perl_is_utf8_X_regular_begin(aTHX_ a)
 #define is_utf8_X_special_begin(a)	Perl_is_utf8_X_special_begin(aTHX_ a)
 #define op_clear(a)		Perl_op_clear(aTHX_ a)
 #define qerror(a)		Perl_qerror(aTHX_ a)

diff --git a/embedvar.h b/embedvar.h
@@ -359,9 +359,9 @@
 #define PL_utf8_X_RI		(vTHX->Iutf8_X_RI)
 #define PL_utf8_X_T		(vTHX->Iutf8_X_T)
 #define PL_utf8_X_V		(vTHX->Iutf8_X_V)
-#define PL_utf8_X_begin		(vTHX->Iutf8_X_begin)
 #define PL_utf8_X_extend	(vTHX->Iutf8_X_extend)
 #define PL_utf8_X_prepend	(vTHX->Iutf8_X_prepend)
+#define PL_utf8_X_regular_begin	(vTHX->Iutf8_X_regular_begin)
 #define PL_utf8_X_special_begin	(vTHX->Iutf8_X_special_begin)
 #define PL_utf8_alnum		(vTHX->Iutf8_alnum)
 #define PL_utf8_alpha		(vTHX->Iutf8_alpha)

diff --git a/intrpvar.h b/intrpvar.h
@@ -626,7 +626,7 @@ PERLVAR(I, utf8_print,	SV *)
 PERLVAR(I, utf8_punct,	SV *)
 PERLVAR(I, utf8_xdigit,	SV *)
 PERLVAR(I, utf8_mark,	SV *)
-PERLVAR(I, utf8_X_begin, SV *)
+PERLVAR(I, utf8_X_regular_begin, SV *)
 PERLVAR(I, utf8_X_extend, SV *)
 PERLVAR(I, utf8_X_prepend, SV *)
 PERLVAR(I, utf8_X_special_begin, SV *)

diff --git a/lib/unicore/mktables b/lib/unicore/mktables
@@ -13512,6 +13512,8 @@ sub compile_perl() {
     #   | Prepend* Begin Extend*
     #   | .
     # Begin is:           ( Special_Begin | ! Control )
+    # Begin is also:      ( Regular_Begin | Special_Begin )
+    #   where Regular_Begin is defined as ( ! Control - Special_Begin )
     # Special_Begin is:   ( Regional-Indicator+ | Hangul-syllable )
     # Extend is:          ( Grapheme_Extend | Spacing_Mark )
     # Control is:         [ GCB_Control CR LF ]
@@ -13558,16 +13560,17 @@ sequences that can begin an extended grapheme cluster.  They need special
 handling because of their complicated nature.
 END
     ));
-    my $begin = $perl->add_match_table('_X_Begin', Perl_Extension => 1,
+    my $regular_begin = $perl->add_match_table('_X_Regular_Begin',
+                                       Perl_Extension => 1,
                                        Fate => $INTERNAL_ONLY,
-                                       Initialize => $specials_begin
-                                                   + ~ $gcb->table('Control')
+                                       Initialize => ~ $gcb->table('Control')
+                                                   - $specials_begin
                                                    - $gcb->table('CR')
                                                    - $gcb->table('LF')
                                       );
-    $begin->add_comment(join_lines( <<END
+    $regular_begin->add_comment(join_lines( <<END
 For use in \\X; matches first character of anything that can begin an extended
-grapheme cluster.
+grapheme cluster, except those that require special handling.
 END
     ));
 

diff --git a/proto.h b/proto.h
@@ -1800,12 +1800,6 @@ PERL_CALLCONV bool	Perl_is_utf8_X_V(pTHX_ const U8 *p)
 #define PERL_ARGS_ASSERT_IS_UTF8_X_V	\
 	assert(p)
 
-PERL_CALLCONV bool	Perl_is_utf8_X_begin(pTHX_ const U8 *p)
-			__attribute__warn_unused_result__
-			__attribute__nonnull__(pTHX_1);
-#define PERL_ARGS_ASSERT_IS_UTF8_X_BEGIN	\
-	assert(p)
-
 PERL_CALLCONV bool	Perl_is_utf8_X_extend(pTHX_ const U8 *p)
 			__attribute__warn_unused_result__
 			__attribute__nonnull__(pTHX_1);
@@ -1818,6 +1812,12 @@ PERL_CALLCONV bool	Perl_is_utf8_X_prepend(pTHX_ const U8 *p)
 #define PERL_ARGS_ASSERT_IS_UTF8_X_PREPEND	\
 	assert(p)
 
+PERL_CALLCONV bool	Perl_is_utf8_X_regular_begin(pTHX_ const U8 *p)
+			__attribute__warn_unused_result__
+			__attribute__nonnull__(pTHX_1);
+#define PERL_ARGS_ASSERT_IS_UTF8_X_REGULAR_BEGIN	\
+	assert(p)
+
 PERL_CALLCONV bool	Perl_is_utf8_X_special_begin(pTHX_ const U8 *p)
 			__attribute__warn_unused_result__
 			__attribute__nonnull__(pTHX_1);

diff --git a/regexec.c b/regexec.c
@@ -144,7 +144,7 @@
 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
         /* No asserts are done for some of these, in case called on a   */  \
         /* Unicode version in which they map to nothing */                  \
-	LOAD_UTF8_CHARCLASS(X_begin, HYPHEN_UTF8);                          \
+	LOAD_UTF8_CHARCLASS(X_regular_begin, HYPHEN_UTF8);                          \
 	LOAD_UTF8_CHARCLASS_NO_CHECK(X_special_begin);                      \
 	LOAD_UTF8_CHARCLASS(X_extend, COMBINING_GRAVE_ACCENT_UTF8);         \
 	LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* empty in most releases*/ \
@@ -3922,6 +3922,15 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
                Control is:         [ GCB_Control  CR  LF ]
                Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
 
+               If we create a 'Regular_Begin' = Begin - Special_Begin, then
+               we can rewrite
+
+                   Begin is ( Regular_Begin + Special Begin )
+
+               It turns out that 98.4% of all Unicode code points match
+               Regular_Begin.  Doing it this way eliminates a table match in
+               the previouls implementation for almost all Unicode code points.
+
 	       There is a subtlety with Prepend* which showed up in testing.
 	       Note that the Begin, and only the Begin is required in:
 	        | Prepend* Begin Extend*
@@ -3977,7 +3986,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
 		     * matched, as it is guaranteed to match the begin */
 		    if (previous_prepend
 			&& (locinput >=  PL_regeol
-			    || ! swash_fetch(PL_utf8_X_begin,
+			    || ! swash_fetch(PL_utf8_X_regular_begin,
 					     (U8*)locinput, utf8_target)))
 		    {
 			locinput = previous_prepend;
@@ -3988,27 +3997,20 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
 		     * moved locinput forward, we tested the result just above
 		     * and it either passed, or we backed off so that it will
 		     * now pass */
-		    if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, utf8_target)) {
+		    if (swash_fetch(PL_utf8_X_regular_begin, (U8*)locinput, utf8_target)) {
+                        locinput += UTF8SKIP(locinput);
+                    }
+                    else if (! swash_fetch(PL_utf8_X_special_begin,
+					(U8*)locinput, utf8_target))
+			{
 
 			/* Here did not match the required 'Begin' in the
 			 * second term.  So just match the very first
 			 * character, the '.' of the final term of the regex */
 			locinput = starting + UTF8SKIP(starting);
+                        goto exit_utf8;
 		    } else {
 
-			/* Here is the beginning of a character that can have
-                         * an extender.  It is either a special begin character
-                         * that requires complicated handling, or a non-control
-                         * */
-			if (! swash_fetch(PL_utf8_X_special_begin,
-					(U8*)locinput, utf8_target))
-			{
-
-			    /* Here not a special begin, must be a
-			     * ('!  * Control') */
-			    locinput += UTF8SKIP(locinput);
-			} else {
-
 			    /* Here is a special begin.  It can be composed
                              * of several individual characters.  One
                              * possibility is RI+ */
@@ -4094,8 +4096,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
 			{
 			    locinput += UTF8SKIP(locinput);
 			}
-		    }
 		}
+            exit_utf8:
 		if (locinput > PL_regeol) sayNO;
 	    }
 	    nextchr = UCHARAT(locinput);

diff --git a/sv.c b/sv.c
@@ -13364,7 +13364,7 @@ perl_clone_using(PerlInterpreter *proto_perl, UV flags,
     PL_utf8_punct	= sv_dup_inc(proto_perl->Iutf8_punct, param);
     PL_utf8_xdigit	= sv_dup_inc(proto_perl->Iutf8_xdigit, param);
     PL_utf8_mark	= sv_dup_inc(proto_perl->Iutf8_mark, param);
-    PL_utf8_X_begin	= sv_dup_inc(proto_perl->Iutf8_X_begin, param);
+    PL_utf8_X_regular_begin	= sv_dup_inc(proto_perl->Iutf8_X_regular_begin, param);
     PL_utf8_X_extend	= sv_dup_inc(proto_perl->Iutf8_X_extend, param);
     PL_utf8_X_prepend	= sv_dup_inc(proto_perl->Iutf8_X_prepend, param);
     PL_utf8_X_special_begin	= sv_dup_inc(proto_perl->Iutf8_X_special_begin, param);

diff --git a/utf8.c b/utf8.c
@@ -2210,13 +2210,13 @@ Perl_is_utf8_mark(pTHX_ const U8 *p)
 }
 
 bool
-Perl_is_utf8_X_begin(pTHX_ const U8 *p)
+Perl_is_utf8_X_regular_begin(pTHX_ const U8 *p)
 {
     dVAR;
 
-    PERL_ARGS_ASSERT_IS_UTF8_X_BEGIN;
+    PERL_ARGS_ASSERT_IS_UTF8_X_REGULAR_BEGIN;
 
-    return is_utf8_common(p, &PL_utf8_X_begin, "_X_Begin");
+    return is_utf8_common(p, &PL_utf8_X_regular_begin, "_X_Regular_Begin");
 }
 
 bool