Skip to content

Commit

Permalink
Refactor \X regex handling to avoid a typical case table lookup
Browse files Browse the repository at this point in the history
Prior to this commit 98.4% of Unicode code points that went through \X
had to be looked up to see if they begin a grapheme cluster; then looked
up again to find that they didn't require special handling.  This commit
refactors things so only one look-up is required for those 98.4%.  It
changes the table generated by mktables to accomplish this, and hence
the name of it, and references to it are changed to correspond.
  • Loading branch information
Karl Williamson committed Aug 28, 2012
1 parent 9914684 commit 27d4fc3
Show file tree
Hide file tree
Showing 9 changed files with 41 additions and 36 deletions.
2 changes: 1 addition & 1 deletion embed.fnc
Original file line number Diff line number Diff line change
Expand Up @@ -660,9 +660,9 @@ ApR |bool |is_utf8_print |NN const U8 *p
ApR |bool |is_utf8_punct |NN const U8 *p
ApR |bool |is_utf8_xdigit |NN const U8 *p
ApR |bool |is_utf8_mark |NN const U8 *p
EXpR |bool |is_utf8_X_begin |NN const U8 *p
EXpR |bool |is_utf8_X_extend |NN const U8 *p
EXpR |bool |is_utf8_X_prepend |NN const U8 *p
EXpR |bool |is_utf8_X_regular_begin|NN const U8 *p
EXpR |bool |is_utf8_X_special_begin|NN const U8 *p
EXpR |bool |is_utf8_X_L |NN const U8 *p
EXpR |bool |is_utf8_X_RI |NN const U8 *p
Expand Down
2 changes: 1 addition & 1 deletion embed.h
Original file line number Diff line number Diff line change
Expand Up @@ -861,9 +861,9 @@
#define is_utf8_X_RI(a) Perl_is_utf8_X_RI(aTHX_ a)
#define is_utf8_X_T(a) Perl_is_utf8_X_T(aTHX_ a)
#define is_utf8_X_V(a) Perl_is_utf8_X_V(aTHX_ a)
#define is_utf8_X_begin(a) Perl_is_utf8_X_begin(aTHX_ a)
#define is_utf8_X_extend(a) Perl_is_utf8_X_extend(aTHX_ a)
#define is_utf8_X_prepend(a) Perl_is_utf8_X_prepend(aTHX_ a)
#define is_utf8_X_regular_begin(a) Perl_is_utf8_X_regular_begin(aTHX_ a)
#define is_utf8_X_special_begin(a) Perl_is_utf8_X_special_begin(aTHX_ a)
#define op_clear(a) Perl_op_clear(aTHX_ a)
#define qerror(a) Perl_qerror(aTHX_ a)
Expand Down
2 changes: 1 addition & 1 deletion embedvar.h
Original file line number Diff line number Diff line change
Expand Up @@ -359,9 +359,9 @@
#define PL_utf8_X_RI (vTHX->Iutf8_X_RI)
#define PL_utf8_X_T (vTHX->Iutf8_X_T)
#define PL_utf8_X_V (vTHX->Iutf8_X_V)
#define PL_utf8_X_begin (vTHX->Iutf8_X_begin)
#define PL_utf8_X_extend (vTHX->Iutf8_X_extend)
#define PL_utf8_X_prepend (vTHX->Iutf8_X_prepend)
#define PL_utf8_X_regular_begin (vTHX->Iutf8_X_regular_begin)
#define PL_utf8_X_special_begin (vTHX->Iutf8_X_special_begin)
#define PL_utf8_alnum (vTHX->Iutf8_alnum)
#define PL_utf8_alpha (vTHX->Iutf8_alpha)
Expand Down
2 changes: 1 addition & 1 deletion intrpvar.h
Original file line number Diff line number Diff line change
Expand Up @@ -626,7 +626,7 @@ PERLVAR(I, utf8_print, SV *)
PERLVAR(I, utf8_punct, SV *)
PERLVAR(I, utf8_xdigit, SV *)
PERLVAR(I, utf8_mark, SV *)
PERLVAR(I, utf8_X_begin, SV *)
PERLVAR(I, utf8_X_regular_begin, SV *)
PERLVAR(I, utf8_X_extend, SV *)
PERLVAR(I, utf8_X_prepend, SV *)
PERLVAR(I, utf8_X_special_begin, SV *)
Expand Down
13 changes: 8 additions & 5 deletions lib/unicore/mktables
Original file line number Diff line number Diff line change
Expand Up @@ -13512,6 +13512,8 @@ sub compile_perl() {
# | Prepend* Begin Extend*
# | .
# Begin is: ( Special_Begin | ! Control )
# Begin is also: ( Regular_Begin | Special_Begin )
# where Regular_Begin is defined as ( ! Control - Special_Begin )
# Special_Begin is: ( Regional-Indicator+ | Hangul-syllable )
# Extend is: ( Grapheme_Extend | Spacing_Mark )
# Control is: [ GCB_Control CR LF ]
Expand Down Expand Up @@ -13558,16 +13560,17 @@ sequences that can begin an extended grapheme cluster. They need special
handling because of their complicated nature.
END
));
my $begin = $perl->add_match_table('_X_Begin', Perl_Extension => 1,
my $regular_begin = $perl->add_match_table('_X_Regular_Begin',
Perl_Extension => 1,
Fate => $INTERNAL_ONLY,
Initialize => $specials_begin
+ ~ $gcb->table('Control')
Initialize => ~ $gcb->table('Control')
- $specials_begin
- $gcb->table('CR')
- $gcb->table('LF')
);
$begin->add_comment(join_lines( <<END
$regular_begin->add_comment(join_lines( <<END
For use in \\X; matches first character of anything that can begin an extended
grapheme cluster.
grapheme cluster, except those that require special handling.
END
));

Expand Down
12 changes: 6 additions & 6 deletions proto.h
Original file line number Diff line number Diff line change
Expand Up @@ -1800,12 +1800,6 @@ PERL_CALLCONV bool Perl_is_utf8_X_V(pTHX_ const U8 *p)
#define PERL_ARGS_ASSERT_IS_UTF8_X_V \
assert(p)

PERL_CALLCONV bool Perl_is_utf8_X_begin(pTHX_ const U8 *p)
__attribute__warn_unused_result__
__attribute__nonnull__(pTHX_1);
#define PERL_ARGS_ASSERT_IS_UTF8_X_BEGIN \
assert(p)

PERL_CALLCONV bool Perl_is_utf8_X_extend(pTHX_ const U8 *p)
__attribute__warn_unused_result__
__attribute__nonnull__(pTHX_1);
Expand All @@ -1818,6 +1812,12 @@ PERL_CALLCONV bool Perl_is_utf8_X_prepend(pTHX_ const U8 *p)
#define PERL_ARGS_ASSERT_IS_UTF8_X_PREPEND \
assert(p)

PERL_CALLCONV bool Perl_is_utf8_X_regular_begin(pTHX_ const U8 *p)
__attribute__warn_unused_result__
__attribute__nonnull__(pTHX_1);
#define PERL_ARGS_ASSERT_IS_UTF8_X_REGULAR_BEGIN \
assert(p)

PERL_CALLCONV bool Perl_is_utf8_X_special_begin(pTHX_ const U8 *p)
__attribute__warn_unused_result__
__attribute__nonnull__(pTHX_1);
Expand Down
36 changes: 19 additions & 17 deletions regexec.c
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@
#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
/* No asserts are done for some of these, in case called on a */ \
/* Unicode version in which they map to nothing */ \
LOAD_UTF8_CHARCLASS(X_begin, HYPHEN_UTF8); \
LOAD_UTF8_CHARCLASS(X_regular_begin, HYPHEN_UTF8); \
LOAD_UTF8_CHARCLASS_NO_CHECK(X_special_begin); \
LOAD_UTF8_CHARCLASS(X_extend, COMBINING_GRAVE_ACCENT_UTF8); \
LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* empty in most releases*/ \
Expand Down Expand Up @@ -3922,6 +3922,15 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
Control is: [ GCB_Control CR LF ]
Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
If we create a 'Regular_Begin' = Begin - Special_Begin, then
we can rewrite
Begin is ( Regular_Begin + Special Begin )
It turns out that 98.4% of all Unicode code points match
Regular_Begin. Doing it this way eliminates a table match in
the previouls implementation for almost all Unicode code points.
There is a subtlety with Prepend* which showed up in testing.
Note that the Begin, and only the Begin is required in:
| Prepend* Begin Extend*
Expand Down Expand Up @@ -3977,7 +3986,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
* matched, as it is guaranteed to match the begin */
if (previous_prepend
&& (locinput >= PL_regeol
|| ! swash_fetch(PL_utf8_X_begin,
|| ! swash_fetch(PL_utf8_X_regular_begin,
(U8*)locinput, utf8_target)))
{
locinput = previous_prepend;
Expand All @@ -3988,27 +3997,20 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
* moved locinput forward, we tested the result just above
* and it either passed, or we backed off so that it will
* now pass */
if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, utf8_target)) {
if (swash_fetch(PL_utf8_X_regular_begin, (U8*)locinput, utf8_target)) {
locinput += UTF8SKIP(locinput);
}
else if (! swash_fetch(PL_utf8_X_special_begin,
(U8*)locinput, utf8_target))
{

/* Here did not match the required 'Begin' in the
* second term. So just match the very first
* character, the '.' of the final term of the regex */
locinput = starting + UTF8SKIP(starting);
goto exit_utf8;
} else {

/* Here is the beginning of a character that can have
* an extender. It is either a special begin character
* that requires complicated handling, or a non-control
* */
if (! swash_fetch(PL_utf8_X_special_begin,
(U8*)locinput, utf8_target))
{

/* Here not a special begin, must be a
* ('! * Control') */
locinput += UTF8SKIP(locinput);
} else {

/* Here is a special begin. It can be composed
* of several individual characters. One
* possibility is RI+ */
Expand Down Expand Up @@ -4094,8 +4096,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
{
locinput += UTF8SKIP(locinput);
}
}
}
exit_utf8:
if (locinput > PL_regeol) sayNO;
}
nextchr = UCHARAT(locinput);
Expand Down
2 changes: 1 addition & 1 deletion sv.c
Original file line number Diff line number Diff line change
Expand Up @@ -13364,7 +13364,7 @@ perl_clone_using(PerlInterpreter *proto_perl, UV flags,
PL_utf8_punct = sv_dup_inc(proto_perl->Iutf8_punct, param);
PL_utf8_xdigit = sv_dup_inc(proto_perl->Iutf8_xdigit, param);
PL_utf8_mark = sv_dup_inc(proto_perl->Iutf8_mark, param);
PL_utf8_X_begin = sv_dup_inc(proto_perl->Iutf8_X_begin, param);
PL_utf8_X_regular_begin = sv_dup_inc(proto_perl->Iutf8_X_regular_begin, param);
PL_utf8_X_extend = sv_dup_inc(proto_perl->Iutf8_X_extend, param);
PL_utf8_X_prepend = sv_dup_inc(proto_perl->Iutf8_X_prepend, param);
PL_utf8_X_special_begin = sv_dup_inc(proto_perl->Iutf8_X_special_begin, param);
Expand Down
6 changes: 3 additions & 3 deletions utf8.c
Original file line number Diff line number Diff line change
Expand Up @@ -2210,13 +2210,13 @@ Perl_is_utf8_mark(pTHX_ const U8 *p)
}

bool
Perl_is_utf8_X_begin(pTHX_ const U8 *p)
Perl_is_utf8_X_regular_begin(pTHX_ const U8 *p)
{
dVAR;

PERL_ARGS_ASSERT_IS_UTF8_X_BEGIN;
PERL_ARGS_ASSERT_IS_UTF8_X_REGULAR_BEGIN;

return is_utf8_common(p, &PL_utf8_X_begin, "_X_Begin");
return is_utf8_common(p, &PL_utf8_X_regular_begin, "_X_Regular_Begin");
}

bool
Expand Down

0 comments on commit 27d4fc3

Please sign in to comment.