Skip to content

Commit ac5c06c

Browse files
committed
Remove staDFA algorithm.
Although it is an interesting algorithm, re2c benchmarks show that it performs worse than TDFA(1): on complex regular expressions staDFA is often much larger and slower than TDFA(1). See the benchmarks on the official re2c website: https://re2c.org/benchmarks/benchmarks.html, or the paper https://arxiv.org/abs/2206.01398 for details. The staDFA implementation adds complexity to the most complex part of the re2c code -- determinization -- and it takes some effort to maintain. The algorithm is not fully documented and formalized: the original paper by Chowdhury is incomplete, and the implementation in re2c had to use a few fixes and modifications to make it work. The benchmarks for staDFA now use re2c-3.0 to build.
1 parent ad7451a commit ac5c06c

File tree

596 files changed

+320
-34973
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

596 files changed

+320
-34973
lines changed

CMakeLists.txt

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,6 @@ set(re2c_sources
165165
src/dfa/fillpoints.cc
166166
src/dfa/find_state.cc
167167
src/dfa/minimization.cc
168-
src/dfa/stacmd.cc
169168
src/dfa/tagver_table.cc
170169
src/dfa/tcmd.cc
171170
src/encoding/ebcdic/ebcdic_regexp.cc
@@ -473,7 +472,6 @@ if (RE2C_BUILD_LIBS)
473472
src/dfa/fillpoints.cc
474473
src/dfa/find_state.cc
475474
src/dfa/minimization.cc
476-
src/dfa/stacmd.cc
477475
src/dfa/tagver_table.cc
478476
src/dfa/tcmd.cc
479477
src/nfa/estimate_size.cc

MAINTAINERS.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,9 @@ outside of lab (to the best of author's knowledge, that is):
1414
Libtool; remove a few internal options and a bunch of explicit template
1515
instantiations in TDFA construction code that exist only for libre2c.
1616

17-
- Sta-DFA, TDFA(0): removing these algorithms would clean up the code and make
18-
the difficult parts around TDFA construction easier to understand. These
19-
algorithms are slower than TDFA(1) and were added for experimental purposes.
17+
- TDFA(0): removing this algorithm would clean up the code and make the
18+
difficult parts around TDFA construction easier to understand. This algorithm
19+
is slower than TDFA(1) and was added for experimental purposes.
2020

2121
- One of the build systems: Autotools or CMake, whichever you like less. Someone
2222
will get unhappy in both cases (Windows is supported only by CMake, but distro

Makefile.am

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@ re2c_HDR = \
3030
src/dfa/determinization.h \
3131
src/dfa/dfa.h \
3232
src/dfa/posix_precedence.h \
33-
src/dfa/stacmd.h \
3433
src/dfa/tag_history.h \
3534
src/dfa/tagver_table.h \
3635
src/dfa/tcmd.h \
@@ -120,7 +119,6 @@ re2c_SRC = \
120119
src/dfa/fillpoints.cc \
121120
src/dfa/find_state.cc \
122121
src/dfa/minimization.cc \
123-
src/dfa/stacmd.cc \
124122
src/dfa/tagver_table.cc \
125123
src/dfa/tcmd.cc \
126124
src/encoding/ebcdic/ebcdic_regexp.cc \

Makefile.lib.am

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@ libre2c_la_HDR = \
2222
src/dfa/determinization.h \
2323
src/dfa/dfa.h \
2424
src/dfa/posix_precedence.h \
25-
src/dfa/stacmd.h \
2625
src/dfa/tag_history.h \
2726
src/dfa/tagver_table.h \
2827
src/dfa/tcmd.h \
@@ -111,7 +110,6 @@ libre2c_la_SRC = \
111110
src/dfa/fillpoints.cc \
112111
src/dfa/find_state.cc \
113112
src/dfa/minimization.cc \
114-
src/dfa/stacmd.cc \
115113
src/dfa/tagver_table.cc \
116114
src/dfa/tcmd.cc \
117115
src/nfa/estimate_size.cc \

benchmarks/submatch_dfa_aot/CMakeLists.txt

Lines changed: 38 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -125,8 +125,15 @@ foreach(bench ${BENCHMARKS})
125125
endforeach()
126126

127127
# re2c
128+
file(MAKE_DIRECTORY "${ENG_DIR}/re2c/")
128129
file(MAKE_DIRECTORY "${GEN_DIR}/re2c/")
129130
set(RE2C "${CMAKE_BINARY_DIR}/re2c")
131+
set(RE2C3 "${ENG_DIR}/re2c/re2c3")
132+
add_custom_command(
133+
OUTPUT "${RE2C3}"
134+
COMMAND "${CMAKE_CURRENT_SOURCE_DIR}/engines/re2c/getre2c3.sh"
135+
WORKING_DIRECTORY "${ENG_DIR}/re2c/"
136+
)
130137
set(RE2C_FLAGS "--reusable" "--tags" "--no-generation-date" "--no-version")
131138
set(COMMON_RE2C
132139
"${SRC_DIR}/re2c/common.re"
@@ -135,28 +142,44 @@ set(COMMON_RE2C
135142
"${SRC_DIR}/re2c/include-eof/fill.re"
136143
"${SRC_DIR}/re2c/include-eof/fill_email.re"
137144
)
145+
# deprecated algorithms were removed after re2c-3.0
146+
set(DEPRECATED_ALGS "stadfa")
138147
foreach(bench ${BENCHMARKS})
139148
foreach(eof "" "-eof")
140149
foreach(alg "tdfa1" "tdfa0" "stadfa")
141150
set(src_file "${SRC_DIR}/re2c/${bench}.re")
142151
set(gen_file "${GEN_DIR}/re2c/${bench}${eof}-${alg}.c")
143152
set(pregen_file "${PREGEN_DIR}/re2c/${bench}${eof}-${alg}.c")
144-
# always regenerate re2c benchmarks (regardless of RE2C_REGEN_BENCHMARKS)
145-
file(RELATIVE_PATH rel_src_file "${CMAKE_CURRENT_BINARY_DIR}" "${src_file}")
146-
file(RELATIVE_PATH rel_gen_file "${CMAKE_CURRENT_BINARY_DIR}" "${gen_file}")
147-
file(RELATIVE_PATH rel_inc_path "${CMAKE_CURRENT_BINARY_DIR}" "${SRC_DIR}/re2c/include${eof}")
148-
set(re2c_flags ${RE2C_FLAGS} "-I" "${rel_inc_path}")
149-
if("${alg}" STREQUAL "tdfa0")
150-
set(re2c_flags ${re2c_flags} "--no-lookahead")
151-
elseif("${alg}" STREQUAL "stadfa")
152-
set(re2c_flags ${re2c_flags} "--stadfa")
153+
# always regenerate re2c benchmarks, except for deprecated algorithms
154+
if(RE2C_REGEN_BENCHMARKS OR NOT (alg IN_LIST DEPRECATED_ALGS))
155+
# for deprecated algorithms use re2c-3.0
156+
if(alg IN_LIST DEPRECATED_ALGS)
157+
set(re2c_for_gen "${RE2C3}")
158+
else()
159+
set(re2c_for_gen "${RE2C}")
160+
endif()
161+
file(RELATIVE_PATH rel_src_file "${CMAKE_CURRENT_BINARY_DIR}" "${src_file}")
162+
file(RELATIVE_PATH rel_gen_file "${CMAKE_CURRENT_BINARY_DIR}" "${gen_file}")
163+
file(RELATIVE_PATH rel_inc_path "${CMAKE_CURRENT_BINARY_DIR}" "${SRC_DIR}/re2c/include${eof}")
164+
set(re2c_flags ${RE2C_FLAGS} "-I" "${rel_inc_path}")
165+
if("${alg}" STREQUAL "tdfa0")
166+
set(re2c_flags ${re2c_flags} "--no-lookahead")
167+
elseif("${alg}" STREQUAL "stadfa")
168+
set(re2c_flags ${re2c_flags} "--stadfa")
169+
endif()
170+
add_custom_command(
171+
OUTPUT "${gen_file}"
172+
COMMAND "${re2c_for_gen}" ${re2c_flags} "${rel_src_file}" -o "${rel_gen_file}"
173+
COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${gen_file}" "${pregen_file}"
174+
DEPENDS "${src_file}" ${COMMON_RE2C} "${re2c_for_gen}"
175+
)
176+
else()
177+
add_custom_command(
178+
OUTPUT "${gen_file}"
179+
COMMAND "${CMAKE_COMMAND}" -E copy "${pregen_file}" "${gen_file}"
180+
DEPENDS "${pregen_file}"
181+
)
153182
endif()
154-
add_custom_command(
155-
OUTPUT "${gen_file}"
156-
COMMAND "${RE2C}" ${re2c_flags} "${rel_src_file}" -o "${rel_gen_file}"
157-
COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${gen_file}" "${pregen_file}"
158-
DEPENDS "${src_file}" ${COMMON_RE2C} "${RE2C}"
159-
)
160183
list(APPEND GEN "${gen_file}")
161184
endforeach()
162185
endforeach()

benchmarks/submatch_dfa_aot/Makefile.am

Lines changed: 29 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ DAT_DIR = data
77
CFLAGS = -O3 -I $(SRC_DIR)
88
RAGEL = $(ENG_DIR)/ragel/ragel7
99
KLEENEX = $(ENG_DIR)/kleenex/kexc
10+
RE2C3 = $(ENG_DIR)/re2c/re2c3
1011
RE2C = $(top_builddir)/re2c
1112
RE2C_FLAGS_COMMON = --reusable --tags --no-generation-date --no-version
1213
RE2C_FLAGS = $(RE2C_FLAGS_COMMON) -I $(SRC_DIR)/re2c/include
@@ -60,21 +61,23 @@ BIN_RAGEL = $(patsubst $(GEN_DIR)%.c, $(BIN_DIR)%, $(GEN_RAGEL))
6061

6162
COMMON_RAGEL = $(COMMON_SRC) $(SRC_DIR)/ragel/common.c
6263

63-
GEN_RE2C_TDFA1 = $(patsubst %, $(GEN_DIR)/re2c/%-tdfa1.c, $(BENCHMARKS))
64-
GEN_RE2C_TDFA0 = $(patsubst %, $(GEN_DIR)/re2c/%-tdfa0.c, $(BENCHMARKS))
65-
GEN_RE2C_STADFA = $(patsubst %, $(GEN_DIR)/re2c/%-stadfa.c, $(BENCHMARKS))
66-
GEN_RE2C_EOF_TDFA1 = $(patsubst %, $(GEN_DIR)/re2c/%-eof-tdfa1.c, $(BENCHMARKS))
67-
GEN_RE2C_EOF_TDFA0 = $(patsubst %, $(GEN_DIR)/re2c/%-eof-tdfa0.c, $(BENCHMARKS))
68-
GEN_RE2C_EOF_STADFA = $(patsubst %, $(GEN_DIR)/re2c/%-eof-stadfa.c, $(BENCHMARKS))
64+
GEN_RE2C_TDFA1 = $(patsubst %, $(GEN_DIR)/re2c/%-tdfa1.c, $(BENCHMARKS))
65+
GEN_RE2C_TDFA0 = $(patsubst %, $(GEN_DIR)/re2c/%-tdfa0.c, $(BENCHMARKS))
66+
GEN_RE2C_EOF_TDFA1 = $(patsubst %, $(GEN_DIR)/re2c/%-eof-tdfa1.c, $(BENCHMARKS))
67+
GEN_RE2C_EOF_TDFA0 = $(patsubst %, $(GEN_DIR)/re2c/%-eof-tdfa0.c, $(BENCHMARKS))
6968
GEN_RE2C = \
7069
$(GEN_RE2C_TDFA1) \
7170
$(GEN_RE2C_TDFA0) \
72-
$(GEN_RE2C_STADFA) \
7371
$(GEN_RE2C_EOF_TDFA1) \
74-
$(GEN_RE2C_EOF_TDFA0) \
75-
$(GEN_RE2C_EOF_STADFA)
72+
$(GEN_RE2C_EOF_TDFA0)
7673

77-
BIN_RE2C = $(patsubst $(GEN_DIR)%.c, $(BIN_DIR)%, $(GEN_RE2C))
74+
GEN_RE2C3_STADFA = $(patsubst %, $(GEN_DIR)/re2c/%-stadfa.c, $(BENCHMARKS))
75+
GEN_RE2C3_EOF_STADFA = $(patsubst %, $(GEN_DIR)/re2c/%-eof-stadfa.c, $(BENCHMARKS))
76+
GEN_RE2C3 = \
77+
$(GEN_RE2C3_STADFA) \
78+
$(GEN_RE2C3_EOF_STADFA)
79+
80+
BIN_RE2C = $(patsubst $(GEN_DIR)%.c, $(BIN_DIR)%, $(GEN_RE2C) $(GEN_RE2C3))
7881

7982
COMMON_RE2C = \
8083
$(COMMON_SRC) \
@@ -97,7 +100,7 @@ GEN_KLEENEX = $(patsubst %, $(GEN_DIR)/kleenex/%.c, \
97100

98101
BIN_KLEENEX = $(patsubst $(GEN_DIR)%.c, $(BIN_DIR)%, $(GEN_KLEENEX))
99102

100-
GEN = $(GEN_RAGEL) $(GEN_RE2C) $(GEN_KLEENEX)
103+
GEN = $(GEN_RAGEL) $(GEN_RE2C) $(GEN_RE2C3) $(GEN_KLEENEX)
101104

102105
BIN_ = $(BIN_RAGEL) $(BIN_RE2C) $(BIN_KLEENEX)
103106
BIN_GCC = $(patsubst %, %-gcc, $(BIN_))
@@ -113,7 +116,7 @@ DAT = $(patsubst %, $(DAT_DIR)/%/big, \
113116

114117
all-local: $(BIN) $(DAT)
115118

116-
# always regenerate re2c benchmarks
119+
# always regenerate re2c benchmarks (except for deprecated algorithms)
117120

118121
$(GEN_RE2C_TDFA1): $(GEN_DIR)/%-tdfa1.c: $(SRC_DIR)/%.re $(COMMON_RE2C) $(RE2C)
119122
$(AM_V_at)mkdir -p $(@D)
@@ -135,19 +138,21 @@ $(GEN_RE2C_EOF_TDFA0): $(GEN_DIR)/%-eof-tdfa0.c: $(SRC_DIR)/%.re $(COMMON_RE2C)
135138
$(AM_V_GEN)$(RE2C) $(RE2C_FLAGS_EOF) --no-lookahead $< -o $@
136139
$(AM_V_at)if ! cmp -s $@ $(PREGEN_DIR)/re2c/$(@F) ; then cp -f $@ $(PREGEN_DIR)/re2c/$(@F) ; fi
137140

138-
$(GEN_RE2C_STADFA): $(GEN_DIR)/%-stadfa.c: $(SRC_DIR)/%.re $(COMMON_RE2C) $(RE2C)
141+
# optionally regenerate ragel, kleenex and re2c-3.0 benchmarks
142+
# (staDFA algorithm was removed in re2c versions after 3.0)
143+
144+
if REGEN_BENCHMARKS
145+
146+
$(GEN_RE2C3_STADFA): $(GEN_DIR)/%-stadfa.c: $(SRC_DIR)/%.re $(COMMON_RE2C) $(RE2C3)
139147
$(AM_V_at)mkdir -p $(@D)
140-
$(AM_V_GEN)$(RE2C) $(RE2C_FLAGS) --stadfa $< -o $@
148+
$(AM_V_GEN)$(RE2C3) $(RE2C_FLAGS) --stadfa $< -o $@
141149
$(AM_V_at)if ! cmp -s $@ $(PREGEN_DIR)/re2c/$(@F) ; then cp -f $@ $(PREGEN_DIR)/re2c/$(@F) ; fi
142150

143-
$(GEN_RE2C_EOF_STADFA): $(GEN_DIR)/%-eof-stadfa.c: $(SRC_DIR)/%.re $(COMMON_RE2C) $(RE2C)
151+
$(GEN_RE2C3_EOF_STADFA): $(GEN_DIR)/%-eof-stadfa.c: $(SRC_DIR)/%.re $(COMMON_RE2C) $(RE2C3)
144152
$(AM_V_at)mkdir -p $(@D)
145-
$(AM_V_GEN)$(RE2C) $(RE2C_FLAGS_EOF) --stadfa $< -o $@
153+
$(AM_V_GEN)$(RE2C3) $(RE2C_FLAGS_EOF) --stadfa $< -o $@
146154
$(AM_V_at)if ! cmp -s $@ $(PREGEN_DIR)/re2c/$(@F) ; then cp -f $@ $(PREGEN_DIR)/re2c/$(@F) ; fi
147155

148-
# optionally regenerate ragel and kleenex benchmarks
149-
if REGEN_BENCHMARKS
150-
151156
$(GEN_RAGEL): $(GEN_DIR)/%.c: $(SRC_DIR)/%.rl $(COMMON_RAGEL) $(RAGEL)
152157
$(AM_V_at)mkdir -p $(@D)
153158
$(AM_V_GEN)$(RAGEL) -G2 $< -o $@
@@ -160,7 +165,7 @@ $(GEN_KLEENEX): $(GEN_DIR)/%.c: $(SRC_DIR)/%.kex $(KLEENEX)
160165

161166
else
162167

163-
$(GEN_RAGEL) $(GEN_KLEENEX): $(GEN_DIR)/%: $(PREGEN_DIR)/%
168+
$(GEN_RAGEL) $(GEN_RE2C3) $(GEN_KLEENEX): $(GEN_DIR)/%: $(PREGEN_DIR)/%
164169
$(AM_V_at)mkdir -p $(@D)
165170
$(AM_V_GEN)cp -f $< $@
166171

@@ -188,5 +193,9 @@ $(RAGEL):
188193
$(AM_V_at)mkdir -p $(@D) && cp $(srcdir)/engines/ragel/getragel7.sh $(@D)
189194
$(AM_V_GEN)( cd $(@D) && ./getragel7.sh )
190195

196+
$(RE2C3):
197+
$(AM_V_at)mkdir -p $(@D) && cp $(srcdir)/engines/re2c/getre2c3.sh $(@D)
198+
$(AM_V_GEN)( cd $(@D) && ./getre2c3.sh )
199+
191200
clean-local:
192201
$(AM_V_at)rm -f $(GEN) $(OBJ) $(BIN) $(DAT)
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
set -ex
2+
3+
if [ -e re2c3 ] ; then
4+
echo "re2c3 is already installed, quitting"
5+
exit 0
6+
fi
7+
8+
RE2C_DIR=re2c-3.0
9+
10+
wget https://github.com/skvadrik/re2c/releases/download/3.0/re2c-3.0.tar.xz \
11+
&& tar xf re2c-3.0.tar.xz \
12+
&& rm re2c-3.0.tar.xz \
13+
&& cd "$RE2C_DIR" \
14+
&& ./configure \
15+
--disable-golang \
16+
--disable-rust \
17+
--prefix=$(pwd)/install \
18+
&& make -j$(nproc) \
19+
&& make install \
20+
&& cd ..
21+
22+
ln -s "$RE2C_DIR"/install/bin/re2c re2c3

benchmarks/submatch_dfa_jit/CMakeLists.txt

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,6 @@ add_executable(bench_submatch_dfa_jit
33
"${CMAKE_SOURCE_DIR}/benchmarks/common/common.cc"
44
)
55

6-
set_property(TARGET bench_submatch_dfa_jit PROPERTY CXX_STANDARD 11)
7-
86
target_link_libraries(bench_submatch_dfa_jit libre2c)
97

108
find_package(benchmark REQUIRED)

benchmarks/submatch_dfa_jit/Makefile.am

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
bench_submatch_dfa_jit_CXXFLAGS = $(AM_CXXFLAGS) -std=c++11 -O3 -I $(top_srcdir)
1+
bench_submatch_dfa_jit_CXXFLAGS = $(AM_CXXFLAGS) -O3 -I $(top_srcdir)
22
bench_submatch_dfa_jit_LDFLAGS = $(LDFLAGS_RE2) -lbenchmark -lpthread
33
bench_submatch_dfa_jit_LDADD = $(top_builddir)/libre2c.la
44

benchmarks/submatch_nfa/CMakeLists.txt

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,6 @@ add_executable(bench_submatch_nfa
33
"${CMAKE_SOURCE_DIR}/benchmarks/common/common.cc"
44
)
55

6-
set_property(TARGET bench_submatch_nfa PROPERTY CXX_STANDARD 11)
7-
86
target_link_libraries(bench_submatch_nfa libre2c)
97

108
find_package(benchmark REQUIRED)

0 commit comments

Comments
 (0)