Skip to content

Commit 6670926

Browse files
committed
Do not break within certain combinations with Indic_Conjunct_Break (InCB)=Linker.
https://www.unicode.org/reports/tr29/tr29-43.html#GB9c
1 parent e63c516 commit 6670926

File tree

1 file changed

+57
-15
lines changed

1 file changed

+57
-15
lines changed

regparse.c

Lines changed: 57 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -5950,19 +5950,21 @@ create_node_from_array(int kind, Node **np, Node **node_array)
59505950
* nodes of the source to NULL_NODE, we can overlap the target array
59515951
* as long as we do not override the actual target location.
59525952
*
5953-
* Target Array name Index
5953+
* Target Array name Index
59545954
*
5955-
* node_array 0 1 2 3 4 5 6 7 8 9 A B C D E F
5956-
* top_alts alts[5] 0 1 2 3 4*
5957-
* alts+1 list[4] 0 1 2 3*
5958-
* list+1 core_alts[7] 0 1 2 3 4 5 6*
5959-
* core_alts+0 H_list[4] 0 1 2 3*
5960-
* H_list+1 H_alt2[4] 0 1 2 3*
5961-
* h_alt2+1 H_list2[3] 0 1 2*
5962-
* core_alts+4 XP_list[4] 0 1 2 3*
5963-
* XP_list+1 Ex_list[4] 0 1 2 3*
5955+
* node_array 0 1 2 3 4 5 6 7 8 9 A B C D E F G H
5956+
* top_alts alts[5] 0 1 2 3 4*
5957+
* alts+2 list[4] 0 1 2 3*
5958+
* list+1 core_alts[8] 0 1 2 3 4 5 6 7*
5959+
* core_alts+0 H_list[4] 0 1 2 3*
5960+
* H_list+1 H_alt2[4] 0 1 2 3*
5961+
* H_alt2+1 H_list2[3] 0 1 2*
5962+
* core_alts+4 XP_list[3] 0 1 2*
5963+
* XP_list+1 Ex_list[4] 0 1 2 3*
5964+
* core_alts+5 CC_list[3] 0 1 2*
5965+
* CC_list+1 CC_inner_list[5] 0 1 2 3 4*
59645966
*/
5965-
#define NODE_COMMON_SIZE 16
5967+
#define NODE_COMMON_SIZE 18
59665968

59675969
static int
59685970
node_extended_grapheme_cluster(Node** np, ScanEnv* env)
@@ -6029,9 +6031,10 @@ node_extended_grapheme_cluster(Node** np, ScanEnv* env)
60296031
/* core := hangul-syllable
60306032
* | ri-sequence
60316033
* | xpicto-sequence
6034+
* | conjunctCluster
60326035
* | [^Control CR LF] */
60336036
{
6034-
Node **core_alts = list + 2; /* size: 7 */
6037+
Node **core_alts = list + 2; /* size: 8 */
60356038

60366039
/* hangul-syllable :=
60376040
* L* (V+ | LV V* | LVT) T*
@@ -6099,10 +6102,49 @@ node_extended_grapheme_cluster(Node** np, ScanEnv* env)
60996102
R_ERR(create_node_from_array(LIST, core_alts+4, XP_list));
61006103
}
61016104

6105+
/* conjunctCluster := \p{InCB=Consonant} ([\p{InCB=Extend} \p{InCB=Linker}]* \p{InCB=Linker} [\p{InCB=Extend} \p{InCB=Linker}]* \p{InCB=Consonant})+ */
6106+
{
6107+
// \p{InCB=Consonant}
6108+
Node **CC_list = core_alts + 6; /* size: 3 */
6109+
R_ERR(create_property_node(CC_list+0, env, "InCB=Consonant"));
6110+
6111+
{
6112+
Node **CC_inner_list = CC_list + 2; /* size: 5 */
6113+
{
6114+
// [\p{InCB=Extend} \p{InCB=Linker}]*
6115+
R_ERR(create_property_node(CC_inner_list+0, env, "InCB=Extend"));
6116+
R_ERR(add_property_to_cc(NCCLASS(CC_inner_list[0]), "InCB=Linker", 0, env));
6117+
R_ERR(quantify_node(CC_inner_list+0, 0, REPEAT_INFINITE));
6118+
}
6119+
6120+
// \p{InCB=Linker}
6121+
R_ERR(create_property_node(CC_inner_list+1, env, "InCB=Linker"));
6122+
6123+
{
6124+
// [\p{InCB=Extend} \p{InCB=Linker}]*
6125+
R_ERR(create_property_node(CC_inner_list+2, env, "InCB=Extend"));
6126+
R_ERR(add_property_to_cc(NCCLASS(CC_inner_list[2]), "InCB=Linker", 0, env));
6127+
R_ERR(quantify_node(CC_inner_list+2, 0, REPEAT_INFINITE));
6128+
}
6129+
6130+
// \p{InCB=Consonant}
6131+
R_ERR(create_property_node(CC_inner_list+3, env, "InCB=Consonant"));
6132+
6133+
// ([\p{InCB=Extend} \p{InCB=Linker}]* \p{InCB=Linker} [\p{InCB=Extend} \p{InCB=Linker}]* \p{InCB=Consonant})
6134+
R_ERR(create_node_from_array(LIST, CC_list+1, CC_inner_list));
6135+
6136+
// (...)+
6137+
R_ERR(quantify_node(CC_list+1, 1, REPEAT_INFINITE));
6138+
}
6139+
6140+
// \p{InCB=Consonant} ([\p{InCB=Extend} \p{InCB=Linker}]* \p{InCB=Linker} [\p{InCB=Extend} \p{InCB=Linker}]* \p{InCB=Consonant})+
6141+
R_ERR(create_node_from_array(LIST, core_alts+5, CC_list));
6142+
}
6143+
61026144
/* [^Control CR LF] */
6103-
core_alts[5] = node_new_cclass();
6104-
if (IS_NULL(core_alts[5])) goto err;
6105-
cc = NCCLASS(core_alts[5]);
6145+
core_alts[6] = node_new_cclass();
6146+
if (IS_NULL(core_alts[6])) goto err;
6147+
cc = NCCLASS(core_alts[6]);
61066148
if (ONIGENC_MBC_MINLEN(env->enc) > 1) { /* UTF-16/UTF-32 */
61076149
BBuf *inverted_buf = NULL;
61086150

0 commit comments

Comments
 (0)