Skip to content
Browse files

initial import of upstream libre

From: cvs -d:pserver:anonymous@libre.cvs.sourceforge.net:/cvsroot/libre
  • Loading branch information...
0 parents commit 594c50761808dd42a5658e9901dc7d9acd65823e @avsm avsm committed Dec 21, 2011
Showing with 5,754 additions and 0 deletions.
  1. +12 −0 Changes
  2. +14 −0 INSTALL
  3. +504 −0 LICENSE
  4. +3 −0 META
  5. +68 −0 Makefile
  6. +70 −0 README
  7. +39 −0 TODO.txt
  8. +657 −0 automata.ml
  9. +78 −0 automata.mli
  10. +120 −0 cset.ml
  11. +23 −0 cset.mli
  12. +24 −0 depend
  13. +953 −0 re.ml
  14. +127 −0 re.mli
  15. +122 −0 re_emacs.ml
  16. +32 −0 re_emacs.mli
  17. +138 −0 re_glob.ml
  18. +32 −0 re_glob.mli
  19. +226 −0 re_perl.ml
  20. +36 −0 re_perl.mli
  21. +154 −0 re_posix.ml
  22. +74 −0 re_posix.mli
  23. +279 −0 re_str.ml
  24. +187 −0 re_str.mli
  25. +9 −0 tests/.cvsignore
  26. +19 −0 tests/CVS/Entries
  27. +1 −0 tests/CVS/Repository
  28. +1 −0 tests/CVS/Root
  29. +36 −0 tests/Input
  30. +24 −0 tests/Makefile
  31. +20 −0 tests/env.ml
  32. +38 −0 tests/longest.c
  33. +42 −0 tests/pcre_match.ml
  34. +8 −0 tests/pcre_scan.ml
  35. +4 −0 tests/perl_scan.pl
  36. +56 −0 tests/re_match.ml
  37. +10 −0 tests/re_scan.ml
  38. +150 −0 tests/scan.ml
  39. +95 −0 tests/test_emacs.ml
  40. +161 −0 tests/test_perl.ml
  41. +397 −0 tests/test_re.ml
  42. +164 −0 tests/test_str.ml
  43. +180 −0 tests/unison.ml
  44. +163 −0 tests/unison2.ml
  45. +204 −0 tests/unison3.ml
12 Changes
@@ -0,0 +1,12 @@
+- Improved API for accessing substring information.
+- The search can now be bounded to a given length.
+- The function "execp" returns a boolean indicating whether the match
+ was successful.
+- The "leol" assertion is fully implemented.
+- The "stop" assertion matches the end of the searched part of the
+ string.
+- "nest" operator: when matching against "nest e", only the group
+ contained in the last match of e will be considered as matching.
+- The semantics of nested matches in Posix regular expressions
+ now follows the standard.
+- Str-compatibility interface
14 INSTALL
@@ -0,0 +1,14 @@
+
+Requirements
+
+ The installation procedure defined in the Makefile requires findlib
+ (http://www.ocaml-programming.de/packages/documentation/findlib/).
+
+Installation
+
+- Compile with "make all".
+
+- If you have ocamlopt, do also "make opt".
+
+- Become super-user if necessary and do "make install"
+ (A "make uninstall" removes the library.)
504 LICENSE
@@ -0,0 +1,504 @@
+ GNU LESSER GENERAL PUBLIC LICENSE
+ Version 2.1, February 1999
+
+ Copyright (C) 1991, 1999 Free Software Foundation, Inc.
+ 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+[This is the first released version of the Lesser GPL. It also counts
+ as the successor of the GNU Library Public License, version 2, hence
+ the version number 2.1.]
+
+ Preamble
+
+ The licenses for most software are designed to take away your
+freedom to share and change it. By contrast, the GNU General Public
+Licenses are intended to guarantee your freedom to share and change
+free software--to make sure the software is free for all its users.
+
+ This license, the Lesser General Public License, applies to some
+specially designated software packages--typically libraries--of the
+Free Software Foundation and other authors who decide to use it. You
+can use it too, but we suggest you first think carefully about whether
+this license or the ordinary General Public License is the better
+strategy to use in any particular case, based on the explanations below.
+
+ When we speak of free software, we are referring to freedom of use,
+not price. Our General Public Licenses are designed to make sure that
+you have the freedom to distribute copies of free software (and charge
+for this service if you wish); that you receive source code or can get
+it if you want it; that you can change the software and use pieces of
+it in new free programs; and that you are informed that you can do
+these things.
+
+ To protect your rights, we need to make restrictions that forbid
+distributors to deny you these rights or to ask you to surrender these
+rights. These restrictions translate to certain responsibilities for
+you if you distribute copies of the library or if you modify it.
+
+ For example, if you distribute copies of the library, whether gratis
+or for a fee, you must give the recipients all the rights that we gave
+you. You must make sure that they, too, receive or can get the source
+code. If you link other code with the library, you must provide
+complete object files to the recipients, so that they can relink them
+with the library after making changes to the library and recompiling
+it. And you must show them these terms so they know their rights.
+
+ We protect your rights with a two-step method: (1) we copyright the
+library, and (2) we offer you this license, which gives you legal
+permission to copy, distribute and/or modify the library.
+
+ To protect each distributor, we want to make it very clear that
+there is no warranty for the free library. Also, if the library is
+modified by someone else and passed on, the recipients should know
+that what they have is not the original version, so that the original
+author's reputation will not be affected by problems that might be
+introduced by others.
+
+ Finally, software patents pose a constant threat to the existence of
+any free program. We wish to make sure that a company cannot
+effectively restrict the users of a free program by obtaining a
+restrictive license from a patent holder. Therefore, we insist that
+any patent license obtained for a version of the library must be
+consistent with the full freedom of use specified in this license.
+
+ Most GNU software, including some libraries, is covered by the
+ordinary GNU General Public License. This license, the GNU Lesser
+General Public License, applies to certain designated libraries, and
+is quite different from the ordinary General Public License. We use
+this license for certain libraries in order to permit linking those
+libraries into non-free programs.
+
+ When a program is linked with a library, whether statically or using
+a shared library, the combination of the two is legally speaking a
+combined work, a derivative of the original library. The ordinary
+General Public License therefore permits such linking only if the
+entire combination fits its criteria of freedom. The Lesser General
+Public License permits more lax criteria for linking other code with
+the library.
+
+ We call this license the "Lesser" General Public License because it
+does Less to protect the user's freedom than the ordinary General
+Public License. It also provides other free software developers Less
+of an advantage over competing non-free programs. These disadvantages
+are the reason we use the ordinary General Public License for many
+libraries. However, the Lesser license provides advantages in certain
+special circumstances.
+
+ For example, on rare occasions, there may be a special need to
+encourage the widest possible use of a certain library, so that it becomes
+a de-facto standard. To achieve this, non-free programs must be
+allowed to use the library. A more frequent case is that a free
+library does the same job as widely used non-free libraries. In this
+case, there is little to gain by limiting the free library to free
+software only, so we use the Lesser General Public License.
+
+ In other cases, permission to use a particular library in non-free
+programs enables a greater number of people to use a large body of
+free software. For example, permission to use the GNU C Library in
+non-free programs enables many more people to use the whole GNU
+operating system, as well as its variant, the GNU/Linux operating
+system.
+
+ Although the Lesser General Public License is Less protective of the
+users' freedom, it does ensure that the user of a program that is
+linked with the Library has the freedom and the wherewithal to run
+that program using a modified version of the Library.
+
+ The precise terms and conditions for copying, distribution and
+modification follow. Pay close attention to the difference between a
+"work based on the library" and a "work that uses the library". The
+former contains code derived from the library, whereas the latter must
+be combined with the library in order to run.
+
+ GNU LESSER GENERAL PUBLIC LICENSE
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+ 0. This License Agreement applies to any software library or other
+program which contains a notice placed by the copyright holder or
+other authorized party saying it may be distributed under the terms of
+this Lesser General Public License (also called "this License").
+Each licensee is addressed as "you".
+
+ A "library" means a collection of software functions and/or data
+prepared so as to be conveniently linked with application programs
+(which use some of those functions and data) to form executables.
+
+ The "Library", below, refers to any such software library or work
+which has been distributed under these terms. A "work based on the
+Library" means either the Library or any derivative work under
+copyright law: that is to say, a work containing the Library or a
+portion of it, either verbatim or with modifications and/or translated
+straightforwardly into another language. (Hereinafter, translation is
+included without limitation in the term "modification".)
+
+ "Source code" for a work means the preferred form of the work for
+making modifications to it. For a library, complete source code means
+all the source code for all modules it contains, plus any associated
+interface definition files, plus the scripts used to control compilation
+and installation of the library.
+
+ Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running a program using the Library is not restricted, and output from
+such a program is covered only if its contents constitute a work based
+on the Library (independent of the use of the Library in a tool for
+writing it). Whether that is true depends on what the Library does
+and what the program that uses the Library does.
+
+ 1. You may copy and distribute verbatim copies of the Library's
+complete source code as you receive it, in any medium, provided that
+you conspicuously and appropriately publish on each copy an
+appropriate copyright notice and disclaimer of warranty; keep intact
+all the notices that refer to this License and to the absence of any
+warranty; and distribute a copy of this License along with the
+Library.
+
+ You may charge a fee for the physical act of transferring a copy,
+and you may at your option offer warranty protection in exchange for a
+fee.
+
+ 2. You may modify your copy or copies of the Library or any portion
+of it, thus forming a work based on the Library, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+ a) The modified work must itself be a software library.
+
+ b) You must cause the files modified to carry prominent notices
+ stating that you changed the files and the date of any change.
+
+ c) You must cause the whole of the work to be licensed at no
+ charge to all third parties under the terms of this License.
+
+ d) If a facility in the modified Library refers to a function or a
+ table of data to be supplied by an application program that uses
+ the facility, other than as an argument passed when the facility
+ is invoked, then you must make a good faith effort to ensure that,
+ in the event an application does not supply such function or
+ table, the facility still operates, and performs whatever part of
+ its purpose remains meaningful.
+
+ (For example, a function in a library to compute square roots has
+ a purpose that is entirely well-defined independent of the
+ application. Therefore, Subsection 2d requires that any
+ application-supplied function or table used by this function must
+ be optional: if the application does not supply it, the square
+ root function must still compute square roots.)
+
+These requirements apply to the modified work as a whole. If
+identifiable sections of that work are not derived from the Library,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works. But when you
+distribute the same sections as part of a whole which is a work based
+on the Library, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote
+it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Library.
+
+In addition, mere aggregation of another work not based on the Library
+with the Library (or with a work based on the Library) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+ 3. You may opt to apply the terms of the ordinary GNU General Public
+License instead of this License to a given copy of the Library. To do
+this, you must alter all the notices that refer to this License, so
+that they refer to the ordinary GNU General Public License, version 2,
+instead of to this License. (If a newer version than version 2 of the
+ordinary GNU General Public License has appeared, then you can specify
+that version instead if you wish.) Do not make any other change in
+these notices.
+
+ Once this change is made in a given copy, it is irreversible for
+that copy, so the ordinary GNU General Public License applies to all
+subsequent copies and derivative works made from that copy.
+
+ This option is useful when you wish to copy part of the code of
+the Library into a program that is not a library.
+
+ 4. You may copy and distribute the Library (or a portion or
+derivative of it, under Section 2) in object code or executable form
+under the terms of Sections 1 and 2 above provided that you accompany
+it with the complete corresponding machine-readable source code, which
+must be distributed under the terms of Sections 1 and 2 above on a
+medium customarily used for software interchange.
+
+ If distribution of object code is made by offering access to copy
+from a designated place, then offering equivalent access to copy the
+source code from the same place satisfies the requirement to
+distribute the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+ 5. A program that contains no derivative of any portion of the
+Library, but is designed to work with the Library by being compiled or
+linked with it, is called a "work that uses the Library". Such a
+work, in isolation, is not a derivative work of the Library, and
+therefore falls outside the scope of this License.
+
+ However, linking a "work that uses the Library" with the Library
+creates an executable that is a derivative of the Library (because it
+contains portions of the Library), rather than a "work that uses the
+library". The executable is therefore covered by this License.
+Section 6 states terms for distribution of such executables.
+
+ When a "work that uses the Library" uses material from a header file
+that is part of the Library, the object code for the work may be a
+derivative work of the Library even though the source code is not.
+Whether this is true is especially significant if the work can be
+linked without the Library, or if the work is itself a library. The
+threshold for this to be true is not precisely defined by law.
+
+ If such an object file uses only numerical parameters, data
+structure layouts and accessors, and small macros and small inline
+functions (ten lines or less in length), then the use of the object
+file is unrestricted, regardless of whether it is legally a derivative
+work. (Executables containing this object code plus portions of the
+Library will still fall under Section 6.)
+
+ Otherwise, if the work is a derivative of the Library, you may
+distribute the object code for the work under the terms of Section 6.
+Any executables containing that work also fall under Section 6,
+whether or not they are linked directly with the Library itself.
+
+ 6. As an exception to the Sections above, you may also combine or
+link a "work that uses the Library" with the Library to produce a
+work containing portions of the Library, and distribute that work
+under terms of your choice, provided that the terms permit
+modification of the work for the customer's own use and reverse
+engineering for debugging such modifications.
+
+ You must give prominent notice with each copy of the work that the
+Library is used in it and that the Library and its use are covered by
+this License. You must supply a copy of this License. If the work
+during execution displays copyright notices, you must include the
+copyright notice for the Library among them, as well as a reference
+directing the user to the copy of this License. Also, you must do one
+of these things:
+
+ a) Accompany the work with the complete corresponding
+ machine-readable source code for the Library including whatever
+ changes were used in the work (which must be distributed under
+ Sections 1 and 2 above); and, if the work is an executable linked
+ with the Library, with the complete machine-readable "work that
+ uses the Library", as object code and/or source code, so that the
+ user can modify the Library and then relink to produce a modified
+ executable containing the modified Library. (It is understood
+ that the user who changes the contents of definitions files in the
+ Library will not necessarily be able to recompile the application
+ to use the modified definitions.)
+
+ b) Use a suitable shared library mechanism for linking with the
+ Library. A suitable mechanism is one that (1) uses at run time a
+ copy of the library already present on the user's computer system,
+ rather than copying library functions into the executable, and (2)
+ will operate properly with a modified version of the library, if
+ the user installs one, as long as the modified version is
+ interface-compatible with the version that the work was made with.
+
+ c) Accompany the work with a written offer, valid for at
+ least three years, to give the same user the materials
+ specified in Subsection 6a, above, for a charge no more
+ than the cost of performing this distribution.
+
+ d) If distribution of the work is made by offering access to copy
+ from a designated place, offer equivalent access to copy the above
+ specified materials from the same place.
+
+ e) Verify that the user has already received a copy of these
+ materials or that you have already sent this user a copy.
+
+ For an executable, the required form of the "work that uses the
+Library" must include any data and utility programs needed for
+reproducing the executable from it. However, as a special exception,
+the materials to be distributed need not include anything that is
+normally distributed (in either source or binary form) with the major
+components (compiler, kernel, and so on) of the operating system on
+which the executable runs, unless that component itself accompanies
+the executable.
+
+ It may happen that this requirement contradicts the license
+restrictions of other proprietary libraries that do not normally
+accompany the operating system. Such a contradiction means you cannot
+use both them and the Library together in an executable that you
+distribute.
+
+ 7. You may place library facilities that are a work based on the
+Library side-by-side in a single library together with other library
+facilities not covered by this License, and distribute such a combined
+library, provided that the separate distribution of the work based on
+the Library and of the other library facilities is otherwise
+permitted, and provided that you do these two things:
+
+ a) Accompany the combined library with a copy of the same work
+ based on the Library, uncombined with any other library
+ facilities. This must be distributed under the terms of the
+ Sections above.
+
+ b) Give prominent notice with the combined library of the fact
+ that part of it is a work based on the Library, and explaining
+ where to find the accompanying uncombined form of the same work.
+
+ 8. You may not copy, modify, sublicense, link with, or distribute
+the Library except as expressly provided under this License. Any
+attempt otherwise to copy, modify, sublicense, link with, or
+distribute the Library is void, and will automatically terminate your
+rights under this License. However, parties who have received copies,
+or rights, from you under this License will not have their licenses
+terminated so long as such parties remain in full compliance.
+
+ 9. You are not required to accept this License, since you have not
+signed it. However, nothing else grants you permission to modify or
+distribute the Library or its derivative works. These actions are
+prohibited by law if you do not accept this License. Therefore, by
+modifying or distributing the Library (or any work based on the
+Library), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Library or works based on it.
+
+ 10. Each time you redistribute the Library (or any work based on the
+Library), the recipient automatically receives a license from the
+original licensor to copy, distribute, link with or modify the Library
+subject to these terms and conditions. You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties with
+this License.
+
+ 11. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Library at all. For example, if a patent
+license would not permit royalty-free redistribution of the Library by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Library.
+
+If any portion of this section is held invalid or unenforceable under any
+particular circumstance, the balance of the section is intended to apply,
+and the section as a whole is intended to apply in other circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system which is
+implemented by public license practices. Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+ 12. If the distribution and/or use of the Library is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Library under this License may add
+an explicit geographical distribution limitation excluding those countries,
+so that distribution is permitted only in or among countries not thus
+excluded. In such case, this License incorporates the limitation as if
+written in the body of this License.
+
+ 13. The Free Software Foundation may publish revised and/or new
+versions of the Lesser General Public License from time to time.
+Such new versions will be similar in spirit to the present version,
+but may differ in detail to address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Library
+specifies a version number of this License which applies to it and
+"any later version", you have the option of following the terms and
+conditions either of that version or of any later version published by
+the Free Software Foundation. If the Library does not specify a
+license version number, you may choose any version ever published by
+the Free Software Foundation.
+
+ 14. If you wish to incorporate parts of the Library into other free
+programs whose distribution conditions are incompatible with these,
+write to the author to ask for permission. For software which is
+copyrighted by the Free Software Foundation, write to the Free
+Software Foundation; we sometimes make exceptions for this. Our
+decision will be guided by the two goals of preserving the free status
+of all derivatives of our free software and of promoting the sharing
+and reuse of software generally.
+
+ NO WARRANTY
+
+ 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
+WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
+EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
+OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
+KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
+LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
+THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+ 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
+WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
+AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
+FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
+CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
+LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
+RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
+FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
+SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
+DAMAGES.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Libraries
+
+ If you develop a new library, and you want it to be of the greatest
+possible use to the public, we recommend making it free software that
+everyone can redistribute and change. You can do so by permitting
+redistribution under these terms (or, alternatively, under the terms of the
+ordinary General Public License).
+
+ To apply these terms, attach the following notices to the library. It is
+safest to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least the
+"copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the library's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+Also add information on how to contact you by electronic and paper mail.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the library, if
+necessary. Here is a sample; alter the names:
+
+ Yoyodyne, Inc., hereby disclaims all copyright interest in the
+ library `Frob' (a library for tweaking knobs) written by James Random Hacker.
+
+ <signature of Ty Coon>, 1 April 1990
+ Ty Coon, President of Vice
+
+That's all there is to it!
+
+
3 META
@@ -0,0 +1,3 @@
+version = "0.1"
+archive(byte) = "re.cma"
+archive(native) = "re.cmxa"
68 Makefile
@@ -0,0 +1,68 @@
+
+NAME = re
+
+OCAMLC = ocamlfind ocamlc -g
+OCAMLOPT = ocamlfind ocamlopt -unsafe
+OCAMLDEP = ocamldep
+
+INCFLAGS =
+OBJECTS = cset.cmo automata.cmo \
+ re.cmo re_posix.cmo re_emacs.cmo re_perl.cmo re_glob.cmo re_str.cmo
+XOBJECTS = $(OBJECTS:cmo=cmx)
+INTFS = re.mli re_posix.mli re_emacs.mli re_perl.mli re_glob.mli re_str.mli
+
+ARCHIVE = $(NAME).cma
+XARCHIVE = $(NAME).cmxa
+
+REQUIRES =
+PREDICATES =
+
+all: $(ARCHIVE)
+opt: $(XARCHIVE)
+
+$(ARCHIVE): $(OBJECTS)
+ $(OCAMLC) -a -o $(ARCHIVE) -package "$(REQUIRES)" -linkpkg \
+ -predicates "$(PREDICATES)" $(OBJECTS)
+$(XARCHIVE): $(XOBJECTS)
+ $(OCAMLOPT) -a -o $(XARCHIVE) -package "$(REQUIRES)" -linkpkg \
+ -predicates "$(PREDICATES)" $(XOBJECTS)
+
+.SUFFIXES: .cmo .cmi .cmx .ml .mli
+
+.ml.cmo:
+ $(OCAMLC) -package "$(REQUIRES)" -predicates "$(PREDICATES)" \
+ $(INCFLAGS) -c $<
+.mli.cmi:
+ $(OCAMLC) -package "$(REQUIRES)" -predicates "$(PREDICATES)" \
+ $(INCFLAGS) -c $<
+.ml.cmx:
+ $(OCAMLOPT) -package "$(REQUIRES)" -predicates "$(PREDICATES)" \
+ $(INCFLAGS) -c $<
+
+depend: *.ml *.mli
+ $(OCAMLDEP) $(INCFLAGS) *.ml *.mli util/*.ml util/*.mli > depend
+include depend
+
+install: all
+ { test ! -f $(XARCHIVE) || extra="$(XARCHIVE) "`basename $(XARCHIVE) .cmxa`.a; }; \
+ ocamlfind install $(NAME) $(INTFS) $(INTFS:mli=cmi) $(ARCHIVE) META $$extra
+
+uninstall:
+ ocamlfind remove $(NAME)
+
+clean::
+ rm -f *.cmi *.cmo *.cmx *.cma *.cmxa *.a *.o
+ rm -f util/*.cmi util/*.cmo util/*.cmx util/*.o
+
+clean::
+ cd tests; make clean
+
+realclean: clean
+ rm -f *~ util/*~
+
+distrib: realclean
+ cd ..; tar zcvf re.tar.gz --exclude CVS re
+
+check: $(ARCHIVE)
+ fort $(ARCHIVE) -env tests/env.ml \
+ tests/test_re.ml tests/test_emacs.ml tests/test_perl.ml
70 README
@@ -0,0 +1,70 @@
+
+DESCRIPTION
+===========
+
+RE is a regular expression library for OCaml. It is still under
+developpement, but is already rather usable.
+
+CONTACT
+=======
+
+This library has been written by Jerome Vouillon (Jerome.Vouillon@inria.fr).
+It can be downloaded from http://libre.sourceforge.net
+
+Bug reports, suggestions and contributions are welcome.
+
+FEATURES
+========
+
+The following styles of regular expressions are supported:
+- Perl-style regular expressions (module Re_perl);
+- Posix extended regular expressions (module Re_posix);
+- Emacs-style regular expressions (module Re_emacs);
+- Shell-style file globbing (module Re_glob).
+
+It is also possible to build regular expressions by combining simpler
+regular expressions (module Re)
+
+The most notable missing features are back-references and
+look-ahead/look-behind assertions.
+
+PERFORMANCES
+============
+
+The matches are performed by lazily building a DFA (deterministic
+finite automata) from the regular expression. As a consequence,
+matching takes linear time in the length of the matched string.
+
+The compilation of patterns is slower than with libraries using
+back-tacking, such as PCRE. But, once a large enough part of the
+DFA is built, matching is extremely fast.
+
+Of course, for some combinations of regular expression and string, the
+part of the DFA that needs to be build is so large that this point is
+never reached, and matching will be slow. This is not expected to
+happen often in practice, and actually a lot of expressions that
+behaves badly with a backtracking implementation are very efficient
+with this implementation.
+
+The library is at the moment entirely written in OCaml. As a
+consequence, regular expression matching is much slower when the
+library is compiled to bytecode than when it is compiled to native
+code.
+
+Here are some timing results (Pentium III 500Mhz):
+* Scanning a 1Mb string containing only 'a's, except for the last
+ character which is a 'b', searching for the pattern "aa?b"
+ (repeated 100 times).
+ - RE: 2.6s
+ - PCRE: 68s
+* Regular expression example from http://www.bagley.org/~doug/shootout/.
+ - RE: 0.43s
+ - PCRE: 3.68s
+* The large regular expression (about 2000 characters long) that
+ Unison uses with my preference file to decide whether a file should
+ be ignored or not. This expression is matched against a filename
+ about 20000 times.
+ - RE: 0.31s
+ - PCRE: 3.7s
+ However, RE is only faster than PCRE when there are more than about
+ 300 filenames.
39 TODO.txt
@@ -0,0 +1,39 @@
+
+High priority (before next release)
+=============
+* Improve the Perl regular expressions parser
+* Character classes (in the three regular expression parsers)
+
+* Reduce memory usage
+ - More compact representation of character sequences
+ - Special notation for "anything but this set of characters"
+ (more generally, optimize the compilation of regular expressions)
+* Simple optimisations
+ - alt containing alt
+ - epsilon elimination
+ - Seq (Seq (x,y), z) => Seq (x, Seq (y, z)) under some circumptances
+ (x or y has a fixed length)
+ ...
+
+* Test suite
+
+Medium priority
+===============
+* Implement back-references
+* Implement look-ahead and look-behind assertions
+
+Low priority
+============
+* Optimize the main loop for processor that are not register starved
+* Rewrite the main loops in C
+ (but keep the option to compile a pure OCaml version)
+* Limit the size of the cached DFAs by removing states that have not
+ been used recently
+* Documentation
+
+Other ideas
+===========
+* It would be great to have a more generic interface (parameterized
+ over some abstract tokens).
+* Str compatibility module
+ (should we implement string_partial_match?)
657 automata.ml
@@ -0,0 +1,657 @@
+(*
+ RE - A regular expression library
+
+ Copyright (C) 2001 Jerome Vouillon
+ email: Jerome.Vouillon@pps.jussieu.fr
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*)
+
+type sem = [ `Longest | `Shortest | `First ]
+
+type rep_kind = [ `Greedy | `Non_greedy ]
+
+type category = int
+type mark = int
+type idx = int
+
+type expr = { id : int; def : def }
+
+and def =
+ Cst of Cset.t
+ | Alt of expr list
+ | Seq of sem * expr * expr
+ | Eps
+ | Rep of rep_kind * sem * expr
+ | Mark of int
+ | Erase of int * int
+ | Before of category
+ | After of category
+
+let def e = e.def
+
+type mark_offsets = (int * int) list
+
+type e =
+ TSeq of e list * expr * sem
+ | TExp of mark_offsets * expr
+ | TMatch of mark_offsets
+
+(****)
+
+let print_kind ch k =
+ Format.fprintf ch "%s"
+ (match k with
+ `Shortest -> "short"
+ | `Longest -> "long"
+ | `First -> "first")
+
+let rec print_expr ch e =
+ match e.def with
+ Cst l ->
+ Format.fprintf ch "@[<3>(cst@ %a)@]" Cset.print l;
+ | Alt l ->
+ Format.fprintf ch "@[<3>(alt";
+ List.iter (fun e -> Format.fprintf ch "@ %a" print_expr e) l;
+ Format.fprintf ch ")@]"
+ | Seq (k, e, e') ->
+ Format.fprintf ch "@[<3>(seq %a@ %a@ %a)@]"
+ print_kind k print_expr e print_expr e'
+ | Eps ->
+ Format.fprintf ch "eps"
+ | Rep (rk, k, e) ->
+ Format.fprintf ch "@[<3>(rep@ %a %a)@]" print_kind k print_expr e
+ | Mark i ->
+ Format.fprintf ch "@[<3>(mark@ %d)@]" i
+ | Erase (b, e) ->
+ Format.fprintf ch "@[<3>(erase@ %d %d)@]" b e
+ | Before c ->
+ Format.fprintf ch "@[<3>(before@ %d)@]" c
+ | After c ->
+ Format.fprintf ch "@[<3>(after@ %d)@]" c
+
+let print_marks ch l =
+ match l with
+ [] ->
+ ()
+ | (a, i) :: r ->
+ Format.fprintf ch "%d-%d" a i;
+ List.iter (fun (a, i) -> Format.fprintf ch " %d-%d" a i) r
+
+let rec print_state_rec ch e y =
+ match e with
+ TMatch marks as x ->
+ Format.fprintf ch "@[<2>(Match@ %a)@]" print_marks marks
+ | TSeq (l', x, kind) ->
+ Format.fprintf ch "@[<2>(Seq@ ";
+ print_state_lst ch l' x;
+ Format.fprintf ch " %a)@]" print_expr x
+ | TExp (marks, {def = Eps}) ->
+ Format.fprintf ch "(Exp %d (%a) (eps))" y.id print_marks marks
+ | TExp (marks, x) ->
+ Format.fprintf ch "(Exp %d (%a) %a)" x.id print_marks marks print_expr x
+
+and print_state_lst ch l y =
+ match l with
+ [] ->
+ Format.fprintf ch "()"
+ | e :: rem ->
+ print_state_rec ch e y;
+ List.iter
+ (fun e ->
+ Format.fprintf ch " | ";
+ print_state_rec ch e y)
+ rem
+
+let print_state ch l = print_state_lst ch l { id = 0; def = Eps }
+
+(****)
+
+let rec first f l =
+ match l with
+ [] ->
+ None
+ | x :: r ->
+ match f x with
+ None -> first f r
+ | Some _ as res -> res
+
+(****)
+
+type ids = int ref
+let create_ids () = ref 0
+
+let eps_expr = { id = 0; def = Eps }
+
+let mk_expr ids def =
+ incr ids;
+ { id = !ids; def = def }
+
+let empty ids = mk_expr ids (Alt [])
+
+let cst ids s =
+ if s = [] then
+ empty ids
+ else
+ mk_expr ids (Cst s)
+
+let alt ids l =
+ match l with
+ [] -> empty ids
+ | [c] -> c
+ | l -> mk_expr ids (Alt l)
+
+let seq ids kind x y =
+ match x.def, y.def with
+ Alt [], _ -> x
+ | _, Alt [] -> y
+ | Eps, _ -> y
+ | _, Eps when kind = `First -> x
+ | _ -> mk_expr ids (Seq (kind, x, y))
+
+let eps ids = mk_expr ids Eps
+
+let rep ids kind sem x = mk_expr ids (Rep (kind, sem, x))
+
+let mark ids m = mk_expr ids (Mark m)
+
+let erase ids m m' = mk_expr ids (Erase (m, m'))
+
+let before ids c = mk_expr ids (Before c)
+
+let after ids c = mk_expr ids (After c)
+
+let texp marks x = TExp (marks, x)
+
+let tseq kind x y rem =
+ match x with
+ [] -> rem
+ | [TExp (marks, {def = Eps})] -> TExp (marks, y) :: rem
+ | _ -> TSeq (x, y, kind) :: rem
+
+(****)
+
+let rec rename ids x =
+ match x.def with
+ Cst _ | Eps | Mark _ | Erase _ | Before _ | After _ ->
+ mk_expr ids x.def
+ | Alt l ->
+ mk_expr ids (Alt (List.map (rename ids) l))
+ | Seq (k, y, z) ->
+ mk_expr ids (Seq (k, rename ids y, rename ids z))
+ | Rep (g, k, y) ->
+ mk_expr ids (Rep (g, k, rename ids y))
+
+(****)
+
+type hash = int
+type mark_infos = int array
+type status = [`Failed | `Match of mark_infos | `Running]
+type state = int * category * e list * status option ref * hash
+
+let dummy_state = (-1, -1, [], ref None, -1)
+
+let hash_combine h accu = accu * 65599 + h
+
+let rec hash_marks l accu =
+ match l with
+ [] -> accu
+ | (a, i) :: r -> hash_marks r (hash_combine a (hash_combine i accu))
+
+let rec hash_e l accu =
+ match l with
+ [] ->
+ accu
+ | TSeq (l', e, _) :: r ->
+ hash_e r (hash_combine 0x172a1bce (hash_combine e.id (hash_e l' accu)))
+ | TExp (marks, e) :: r ->
+ hash_e r
+ (hash_combine 0x2b4c0d77 (hash_combine e.id (hash_marks marks accu)))
+ | TMatch marks :: r ->
+ hash_e r (hash_combine 0x1c205ad5 (hash_marks marks accu))
+
+let hash_state idx cat desc =
+ hash_e desc (hash_combine idx (hash_combine cat 0)) land 0x3FFFFFFF
+
+let mk_state idx cat desc = (idx, cat, desc, ref None, hash_state idx cat desc)
+
+let create_state cat e = mk_state 0 cat [TExp ([], e)]
+
+let rec equal_e l1 l2 =
+ match l1, l2 with
+ [], [] ->
+ true
+ | TSeq (l1', e1, _) :: r1, TSeq (l2', e2, _) :: r2 ->
+ e1.id = e2.id && equal_e l1' l2' && equal_e r1 r2
+ | TExp (marks1, e1) :: r1, TExp (marks2, e2) :: r2 ->
+ e1.id = e2.id && marks1 = marks2 && equal_e r1 r2
+ | TMatch marks1 :: r1, TMatch marks2 :: r2 ->
+ marks1 = marks2 && equal_e r1 r2
+ | _ ->
+ false
+
+let equal_state (idx1, cat1, desc1, _, h1) (idx2, cat2, desc2, _, h2) =
+ (h1 : int) = h2 && (idx1 : int) = idx2 &&
+ (cat1 : int) = cat2 && equal_e desc1 desc2
+
+let compare_state (idx1, cat1, desc1, _, h1) (idx2, cat2, desc2, _, h2) =
+ let c = compare (h1 : int) h2 in
+ if c <> 0 then c else
+ let c = compare (cat1 : int) cat2 in
+ if c <> 0 then c else
+ compare desc1 desc2
+
+module States =
+ Hashtbl.Make
+ (struct
+ type t = state
+ let equal = equal_state
+ let hash (_, _, _, _, h) = h
+ end)
+
+(**** Find a free index ****)
+
+type working_area = bool array ref
+
+let create_working_area () = ref [| false |]
+
+let index_count w = Array.length !w
+
+let reset_table a = Array.fill a 0 (Array.length a) false
+
+let rec mark_used_indices tbl l =
+ List.iter
+ (fun x ->
+ match x with
+ TSeq (l, _, _) ->
+ mark_used_indices tbl l
+ | TExp (marks, _) ->
+ List.iter (fun (_, i) -> if i >= 0 then tbl.(i) <- true) marks
+ | TMatch marks ->
+ List.iter (fun (_, i) -> if i >= 0 then tbl.(i) <- true) marks)
+ l
+
+let rec find_free tbl idx len =
+ if idx = len || not tbl.(idx) then idx else find_free tbl (idx + 1) len
+
+let free_index tbl_ref l =
+ let tbl = !tbl_ref in
+ reset_table tbl;
+ mark_used_indices tbl l;
+ let len = Array.length tbl in
+ let idx = find_free tbl 0 len in
+ if idx = len then tbl_ref := Array.make (2 * len) false;
+ idx
+
+(**** Computation of the next state ****)
+
+let remove_matches l =
+ List.filter (fun x -> match x with TMatch _ -> false | _ -> true) l
+
+let rec split_at_match_rec l' l =
+ match l with
+ [] -> assert false
+ | TMatch _ as x :: r -> (List.rev l', remove_matches r)
+ | x :: r -> split_at_match_rec (x :: l') r
+
+let split_at_match l = split_at_match_rec [] l
+
+let rec remove_duplicates prev l y =
+ match l with
+ [] ->
+ ([], prev)
+ | TMatch _ as x :: r -> (* Truncate after first match *)
+ ([x], prev)
+ | TSeq (l', x, kind) :: r ->
+ let (l'', prev') = remove_duplicates prev l' x in
+ let (r', prev'') = remove_duplicates prev' r y in
+ (tseq kind l'' x r', prev'')
+ | TExp (marks, {def = Eps}) as e :: r ->
+ if List.memq y.id prev then
+ remove_duplicates prev r y
+ else
+ let (r', prev') = remove_duplicates (y.id :: prev) r y in
+ (e :: r', prev')
+ | TExp (marks, x) as e :: r ->
+ if List.memq x.id prev then
+ remove_duplicates prev r y
+ else
+ let (r', prev') = remove_duplicates (x.id :: prev) r y in
+ (e :: r', prev')
+
+let rec marks_set_idx used idx marks =
+ match marks with
+ (a, -1) :: rem ->
+ used := true;
+ (a, idx) :: marks_set_idx used idx rem
+ | _ ->
+ marks
+
+let rec set_idx used idx l =
+ match l with
+ [] ->
+ []
+ | TMatch marks :: r ->
+ TMatch (marks_set_idx used idx marks) :: set_idx used idx r
+ | TSeq (l', x, kind) :: r ->
+ TSeq (set_idx used idx l', x, kind) :: set_idx used idx r
+ | TExp (marks, x) :: r ->
+ TExp (marks_set_idx used idx marks, x) :: set_idx used idx r
+
+let rec filter_marks b e marks =
+ match marks with
+ [] ->
+ []
+ | (i, p) :: rem ->
+ if i >= b && i <= e then rem else (i, p) :: filter_marks b e rem
+
+let rec delta_1 marks c cat' cat x rem =
+(*Format.eprintf "%d@." x.id;*)
+ match x.def with
+ Cst s ->
+ if Cset.mem c s then texp marks eps_expr :: rem else rem
+ | Alt l ->
+ delta_2 marks c cat' cat l rem
+ | Seq (kind, y, z) ->
+ let y' = delta_1 marks c cat' cat y [] in
+ delta_seq c cat' cat kind y' z rem
+ | Rep (rep_kind, kind, y) ->
+ let y' = delta_1 marks c cat' cat y [] in
+ let (y'', marks') =
+ match
+ first
+ (fun x -> match x with TMatch marks -> Some marks | _ -> None) y'
+ with
+ None -> (y', marks)
+ | Some marks' -> (remove_matches y', marks')
+ in
+ begin match rep_kind with
+ `Greedy -> tseq kind y'' x (TMatch marks' :: rem)
+ | `Non_greedy -> TMatch marks :: tseq kind y'' x rem
+ end
+ | Eps ->
+ TMatch marks :: rem
+ | Mark i ->
+ TMatch ((i, -1) :: List.remove_assq i marks) :: rem
+ | Erase (b, e) ->
+ TMatch (filter_marks b e marks) :: rem
+ | Before cat'' ->
+ if cat land cat'' <> 0 then TMatch marks :: rem else rem
+ | After cat'' ->
+ if cat' land cat'' <> 0 then TMatch marks :: rem else rem
+
+and delta_2 marks c cat' cat l rem =
+ match l with
+ [] -> rem
+ | y :: r -> delta_1 marks c cat' cat y (delta_2 marks c cat' cat r rem)
+
+and delta_seq c cat' cat kind y z rem =
+ match
+ first (fun x -> match x with TMatch marks -> Some marks | _ -> None) y
+ with
+ None ->
+ tseq kind y z rem
+ | Some marks ->
+ match kind with
+ `Longest ->
+ tseq kind (remove_matches y) z (delta_1 marks c cat' cat z rem)
+ | `Shortest ->
+ delta_1 marks c cat' cat z (tseq kind (remove_matches y) z rem)
+ | `First ->
+ let (y', y'') = split_at_match y in
+ tseq kind y' z (delta_1 marks c cat' cat z (tseq kind y'' z rem))
+
+let rec delta_3 c cat' cat x rem =
+ match x with
+ TSeq (y, z, kind) ->
+ let y' = delta_4 c cat' cat y [] in
+ delta_seq c cat' cat kind y' z rem
+ | TExp (marks, e) ->
+ delta_1 marks c cat' cat e rem
+ | TMatch _ ->
+ x :: rem
+
+and delta_4 c cat' cat l rem =
+ match l with
+ [] -> rem
+ | y :: r -> delta_3 c cat' cat y (delta_4 c cat' cat r rem)
+
+let delta tbl_ref cat' char (_, cat, expr, _, _) =
+ let (expr', _) =
+ remove_duplicates [] (delta_4 char cat cat' expr []) eps_expr in
+ let idx = free_index tbl_ref expr' in
+ let used = ref false in
+ let expr'' = set_idx used idx expr' in
+ mk_state idx cat' expr''
+
+(****)
+
+let rec red_tr l =
+ match l with
+ [] | [_] ->
+ l
+ | ((s1, st1) as tr1) :: ((s2, st2) as tr2) :: rem ->
+ if equal_state st1 st2 then
+ red_tr ((Cset.union s1 s2, st1) :: rem)
+ else
+ tr1 :: red_tr (tr2 :: rem)
+
+let simpl_tr l =
+ List.sort
+ (fun (s1, _) (s2, _) -> compare s1 s2)
+ (red_tr (List.sort (fun (_, st1) (_, st2) -> compare_state st1 st2) l))
+
+(****)
+
+let rec prepend s x l =
+ match s, l with
+ [], _ ->
+ l
+ | _, [] ->
+ []
+ | (c, c') :: r, ([d, d'], x') :: r' when c' < d ->
+ prepend r x l
+ | (c, c') :: r, ([d, d'], x') :: r' ->
+ if c <= d then begin
+ if c' < d' then
+ ([d, c'], x @ x') :: prepend r x (([c' + 1, d'], x') :: r')
+ else
+ ([d, d'], x @ x') :: prepend s x r'
+ end else begin
+ if c > d' then
+ ([d, d'], x') :: prepend s x r'
+ else
+ ([d, c - 1], x') :: prepend s x (([c, d'], x') :: r')
+ end
+ | _ ->
+ assert false
+
+let prepend_deriv d l = List.fold_right (fun (s, x) l -> prepend s x l) d l
+
+let rec restrict s l =
+ match l with
+ [] ->
+ []
+ | (s', x') :: rem ->
+ let s'' = Cset.inter s s' in
+ if s'' = [] then
+ restrict s rem
+ else
+ (s'', x') :: restrict s rem
+
+let rec remove_marks b e rem =
+ if b > e then rem else remove_marks b (e - 1) ((e, -2) :: rem)
+
+let rec merge_marks old nw =
+ match nw with
+ [] ->
+ old
+ | (i, v) :: rem ->
+ let nw' = merge_marks (List.remove_assq i old) rem in
+ if v = -2 then
+ nw'
+ else
+ (i, v) :: nw'
+
+let rec prepend_marks_expr m e =
+ match e with
+ TSeq (l, e', s) -> TSeq (prepend_marks_expr_lst m l, e', s)
+ | TExp (m', e') -> TExp (merge_marks m m', e')
+ | TMatch m' -> TMatch (merge_marks m m')
+
+and prepend_marks_expr_lst m l =
+ List.map (prepend_marks_expr m) l
+
+let prepend_marks (m : mark_offsets) l =
+ List.map (fun (s, x) -> (s, prepend_marks_expr_lst m x)) l
+
+let rec deriv_1 all_chars categories marks cat x rem =
+ match x.def with
+ Cst s ->
+ prepend s [texp marks eps_expr] rem
+ | Alt l ->
+ deriv_2 all_chars categories marks cat l rem
+ | Seq (kind, y, z) ->
+ let y' = deriv_1 all_chars categories marks cat y [(all_chars, [])] in
+ deriv_seq all_chars categories cat kind y' z rem
+ | Rep (rep_kind, kind, y) ->
+ let y' = deriv_1 all_chars categories marks cat y [(all_chars, [])] in
+ List.fold_right
+ (fun (s, z) rem ->
+ let (z', marks') =
+ match
+ first
+ (fun z -> match z with TMatch marks -> Some marks | _ -> None)
+ z
+ with
+ None -> (z, marks)
+ | Some marks' -> (remove_matches z, marks')
+ in
+ prepend s
+ (match rep_kind with
+ `Greedy -> tseq kind z' x [TMatch marks']
+ | `Non_greedy -> TMatch marks :: tseq kind z' x [])
+ rem)
+ y' rem
+ | Eps ->
+ prepend all_chars [TMatch marks] rem
+ | Mark i ->
+ prepend all_chars [TMatch ((i, -1) :: List.remove_assq i marks)] rem
+ | Erase (b, e) ->
+ prepend all_chars
+ [TMatch (remove_marks b e (filter_marks b e marks))] rem
+ | Before cat' ->
+ prepend (List.assq cat' categories) [TMatch marks] rem
+ | After cat' ->
+ if cat land cat' <> 0 then prepend all_chars [TMatch marks] rem else rem
+
+and deriv_2 all_chars categories marks cat l rem =
+ match l with
+ [] -> rem
+ | y :: r -> deriv_1 all_chars categories marks cat y
+ (deriv_2 all_chars categories marks cat r rem)
+
+and deriv_seq all_chars categories cat kind y z rem =
+ if
+ List.exists
+ (fun (s, xl) ->
+ List.exists (fun x -> match x with TMatch _ -> true | _ -> false) xl)
+ y
+ then
+ let z' = deriv_1 all_chars categories [] cat z [(all_chars, [])] in
+ List.fold_right
+ (fun (s, y) rem ->
+ match
+ first (fun x -> match x with TMatch marks -> Some marks | _ -> None)
+ y
+ with
+ None ->
+ prepend s (tseq kind y z []) rem
+ | Some marks ->
+ let z'' = prepend_marks marks z' in
+ match kind with
+ `Longest ->
+ prepend s (tseq kind (remove_matches y) z []) (
+ prepend_deriv (restrict s z'') rem)
+ | `Shortest ->
+ prepend_deriv (restrict s z'') (
+ prepend s (tseq kind (remove_matches y) z []) rem)
+ | `First ->
+ let (y', y'') = split_at_match y in
+ prepend s (tseq kind y' z []) (
+ prepend_deriv (restrict s z'') (
+ prepend s (tseq kind y'' z []) rem)))
+ y rem
+ else
+ List.fold_right
+ (fun (s, xl) rem -> prepend s (tseq kind xl z []) rem) y rem
+
+let rec deriv_3 all_chars categories cat x rem =
+ match x with
+ TSeq (y, z, kind) ->
+ let y' = deriv_4 all_chars categories cat y [(all_chars, [])] in
+ deriv_seq all_chars categories cat kind y' z rem
+ | TExp (marks, e) ->
+ deriv_1 all_chars categories marks cat e rem
+ | TMatch _ ->
+ prepend all_chars [x] rem
+
+and deriv_4 all_chars categories cat l rem =
+ match l with
+ [] -> rem
+ | y :: r -> deriv_3 all_chars categories cat y
+ (deriv_4 all_chars categories cat r rem)
+
+let deriv tbl_ref all_chars categories (_, cat, expr, _, _) =
+ let der = deriv_4 all_chars categories cat expr [(all_chars, [])] in
+ simpl_tr
+ (List.fold_right
+ (fun (s, expr) rem ->
+ let (expr', _) = remove_duplicates [] expr eps_expr in
+(*
+Format.eprintf "@[<3>@[%a@]: %a / %a@]@." Cset.print s print_state expr print_state expr';
+*)
+ let idx = free_index tbl_ref expr' in
+ let used = ref false in
+ let expr'' = set_idx used idx expr' in
+ List.fold_right
+ (fun (cat', s') rem ->
+ let s'' = Cset.inter s s' in
+ if s'' = [] then rem else
+ (s'', mk_state idx cat' expr'') :: rem)
+ categories rem)
+ der [])
+
+(****)
+
+let flatten_match m =
+ let ma = List.fold_left (fun ma (i, _) -> max ma i) (-1) m in
+ let res = Array.create (ma + 1) (-1) in
+ List.iter (fun (i, v) -> res.(i) <- v) m;
+ res
+
+let status (_, _, desc, status, _) =
+ match !status with
+ Some st ->
+ st
+ | None ->
+ let st =
+ match desc with
+ [] -> `Failed
+ | TMatch m :: _ -> `Match (flatten_match m)
+ | _ -> `Running
+ in
+ status := Some st;
+ st
78 automata.mli
@@ -0,0 +1,78 @@
+
+(* Regular expressions *)
+
+type category = int
+type mark = int
+
+type sem = [ `Longest | `Shortest | `First ]
+type rep_kind = [ `Greedy | `Non_greedy ]
+
+type expr
+type def =
+ Cst of Cset.t
+ | Alt of expr list
+ | Seq of sem * expr * expr
+ | Eps
+ | Rep of rep_kind * sem * expr
+ | Mark of mark
+ | Erase of mark * mark
+ | Before of category
+ | After of category
+val def : expr -> def
+val print_expr : Format.formatter -> expr -> unit
+
+type ids
+val create_ids : unit -> ids
+
+val cst : ids -> Cset.t -> expr
+val empty : ids -> expr
+val alt : ids -> expr list -> expr
+val seq : ids -> sem -> expr -> expr -> expr
+val eps : ids -> expr
+val rep : ids -> rep_kind -> sem -> expr -> expr
+val mark : ids -> mark -> expr
+val erase : ids -> mark -> mark -> expr
+val before : ids -> category -> expr
+val after : ids -> category -> expr
+
+val rename : ids -> expr -> expr
+
+(****)
+
+(* States of the automata *)
+
+type idx = int
+type mark_offsets = (mark * idx) list
+type e =
+ TSeq of e list * expr * sem
+ | TExp of mark_offsets * expr
+ | TMatch of mark_offsets
+
+val print_state : Format.formatter -> e list -> unit
+
+type hash
+type mark_infos = int array
+type status = [`Failed | `Match of mark_infos | `Running]
+type state =
+ idx * category * e list * status option ref * hash
+val dummy_state : state
+val mk_state : idx -> category -> e list -> state
+val create_state : category -> expr -> state
+module States : Hashtbl.S with type key = state
+
+(****)
+
+(* Computation of the states following a given state *)
+
+type working_area
+val create_working_area : unit -> working_area
+val index_count : working_area -> int
+
+val delta : working_area -> category -> Cset.c -> state -> state
+val deriv :
+ working_area -> Cset.t -> (category * Cset.t) list -> state ->
+ (Cset.t * state) list
+
+(****)
+
+val status : state -> status
120 cset.ml
@@ -0,0 +1,120 @@
+(*
+ RE - A regular expression library
+
+ Copyright (C) 2001 Jerome Vouillon
+ email: Jerome.Vouillon@pps.jussieu.fr
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*)
+
+type c = int
+type t = (c * c) list
+
+let rec union l l' =
+ match l, l' with
+ _, [] -> l
+ | [], _ -> l'
+ | (c1, c2)::r, (c1', c2')::r' ->
+ if c2 + 1 < c1' then
+ (c1, c2)::union r l'
+ else if c2' + 1 < c1 then
+ (c1', c2')::union l r'
+ else if c2 < c2' then
+ union r ((min c1 c1', c2')::r')
+ else
+ union ((min c1 c1', c2)::r) r'
+
+let rec inter l l' =
+ match l, l' with
+ _, [] -> []
+ | [], _ -> []
+ | (c1, c2)::r, (c1', c2')::r' ->
+ if c2 < c1' then
+ inter r l'
+ else if c2' < c1 then
+ inter l r'
+ else if c2 < c2' then
+ (max c1 c1', c2)::inter r l'
+ else
+ (max c1 c1', c2')::inter l r'
+
+let rec diff l l' =
+ match l, l' with
+ _, [] -> l
+ | [], _ -> []
+ | (c1, c2)::r, (c1', c2')::r' ->
+ if c2 < c1' then
+ (c1, c2)::diff r l'
+ else if c2' < c1 then
+ diff l r'
+ else
+ let r'' = if c2' < c2 then (c2' + 1, c2) :: r else r in
+ if c1 < c1' then
+ (c1, c1' - 1)::diff r'' r'
+ else
+ diff r'' r'
+
+let single c = [c, c]
+
+let add c l = union (single c) l
+
+let seq c c' = if c <= c' then [c, c'] else [c', c]
+
+let rec offset o l =
+ match l with
+ [] -> []
+ | (c1, c2) :: r -> (c1 + o, c2 + o) :: offset o r
+
+let empty = []
+
+let rec mem (c : int) s =
+ match s with
+ [] -> false
+ | (c1, c2) :: rem -> if c <= c2 then c >= c1 else mem c rem
+
+(****)
+
+type hash = int
+
+let rec hash_rec l =
+ match l with
+ [] -> 0
+ | (i, j)::r -> i + 13 * j + 257 * hash_rec r
+let hash l = (hash_rec l) land 0x3FFFFFFF
+
+module Map =
+ Map.Make
+ (struct
+ type t = int * (int * int) list
+ let compare (i, u) (j, v) =
+ let c = compare i j in if c <> 0 then c else compare u v
+ end)
+
+(****)
+
+let print_one ch c1 c2 =
+ if c1 = c2 then
+ Format.fprintf ch "@ %d" c1
+ else
+ Format.fprintf ch "@ %d-%d" c1 c2
+
+let print ch l =
+ match l with
+ [] ->
+ ()
+ | (c1, c2) :: rem ->
+ print_one ch c1 c2;
+ List.iter
+ (fun (c1, c2) -> Format.fprintf ch "@ "; print_one ch c1 c2) rem
23 cset.mli
@@ -0,0 +1,23 @@
+
+(* Character sets, represented as sorted list of intervals *)
+
+type c = int
+type t = (c * c) list
+
+val union : t -> t -> t
+val inter : t -> t -> t
+val diff : t -> t -> t
+val offset : int -> t -> t
+
+val empty : t
+val single : c -> t
+val seq : c -> c -> t
+val add : c -> t -> t
+
+val mem : c -> t -> bool
+
+type hash
+val hash : t -> hash
+module Map : Map.S with type key = hash * t
+
+val print : Format.formatter -> t -> unit
24 depend
@@ -0,0 +1,24 @@
+automata.cmo: cset.cmi automata.cmi
+automata.cmx: cset.cmx automata.cmi
+cset.cmo: cset.cmi
+cset.cmx: cset.cmi
+re.cmo: cset.cmi automata.cmi re.cmi
+re.cmx: cset.cmx automata.cmx re.cmi
+re_emacs.cmo: re.cmi re_emacs.cmi
+re_emacs.cmx: re.cmx re_emacs.cmi
+re_glob.cmo: re.cmi re_glob.cmi
+re_glob.cmx: re.cmx re_glob.cmi
+re_perl.cmo: re.cmi re_perl.cmi
+re_perl.cmx: re.cmx re_perl.cmi
+re_posix.cmo: re.cmi re_posix.cmi
+re_posix.cmx: re.cmx re_posix.cmi
+re_str.cmo: re_emacs.cmi re.cmi re_str.cmi
+re_str.cmx: re_emacs.cmx re.cmx re_str.cmi
+automata.cmi: cset.cmi
+cset.cmi:
+re.cmi:
+re_emacs.cmi: re.cmi
+re_glob.cmi: re.cmi
+re_perl.cmi: re.cmi
+re_posix.cmi: re.cmi
+re_str.cmi:
953 re.ml
@@ -0,0 +1,953 @@
+(*
+ RE - A regular expression library
+
+ Copyright (C) 2001 Jerome Vouillon
+ email: Jerome.Vouillon@pps.jussieu.fr
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*)
+
+let rec first f l =
+ match l with
+ [] -> None
+ | x :: r -> match f x with
+ None -> first f r
+ | Some _ as res -> res
+
+let rec iter n f v = if n = 0 then v else iter (n - 1) f (f v)
+
+(****)
+
+let unknown = -2
+let break = -3
+
+type 'a match_info =
+ [ `Match of 'a
+ | `Failed
+ | `Running ]
+
+type state =
+ { idx : int;
+ (* Index of the current position in the position table.
+ Not yet computed transitions point to a dummy state where
+ [idx] is set to [unknown];
+ If [idx] is set to [break] for states that either always
+ succeed or always fail. *)
+ real_idx : int;
+ (* The real index, in case [idx] is set to [break] *)
+ next : state array;
+ (* Transition table, indexed by color *)
+ mutable final :
+ (Automata.category *
+ (Automata.idx * Automata.mark_infos match_info)) list;
+ (* Mapping from the category of the next character to
+ - the index where the next position should be saved
+ - possibly, the list of marks (and the corresponding indices)
+ corresponding to the best match *)
+ desc : Automata.state
+ (* Description of this state of the automata *) }
+
+(* Automata (compiled regular expression) *)
+type re =
+ { initial : Automata.expr;
+ (* The whole regular expression *)
+ mutable initial_states : (int * state) list;
+ (* Initial states, indexed by initial category *)
+ cols : string;
+ (* Color table *)
+ col_repr : string;
+ (* Table from colors to one character of this color *)
+ ncol : int;
+ (* Number of colors *)
+ lnl : int;
+ (* Color of the last newline *)
+ mutable tbl : Automata.working_area;
+ (* Temporary table used to compute the first available index
+ when computing a new state *)
+ states : state Automata.States.t;
+ (* States of the deterministic automata *)
+ group_count : int
+ (* Number of groups in the regular expression *) }
+
+let print_re ch re = Automata.print_expr ch re.initial
+
+(* Information used during matching *)
+type info =
+ { re : re;
+ (* The automata *)
+ i_cols : string;
+ (* Color table ([x.i_cols = x.re.cols])
+ Sortcut used for performance reasons *)
+ mutable positions : int array;
+ (* Array of mark positions
+ The mark are off by one for performance reasons *)
+ mutable pos : int;
+ (* Position where the match is started *)
+ mutable last : int
+ (* Position where the match should stop *) }
+
+(****)
+
+let cat_inexistant = 1
+let cat_letter = 2
+let cat_not_letter = 4
+let cat_newline = 8
+let cat_lastnewline = 16
+let cat_search_boundary = 32
+
+let category re c =
+ if c = -1 then cat_inexistant else
+ (* Special category for the last newline *)
+ if c = re.lnl then cat_lastnewline lor cat_newline lor cat_not_letter else
+ match re.col_repr.[c] with
+ 'a'..'z' | 'A'..'Z' ->
+ cat_letter
+ | '\n' ->
+ cat_not_letter lor cat_newline
+ | _ ->
+ cat_not_letter
+
+(****)
+
+let dummy_next = [||]
+
+let unknown_state =
+ { idx = unknown; real_idx = 0;
+ next = dummy_next; final = [];
+ desc = Automata.dummy_state }
+
+let count = ref 0
+let mk_state ncol ((idx, _, _, _, _) as desc) =
+ let break_state =
+ match Automata.status desc with
+ `Running -> false
+ | _ -> true
+ in
+ { idx = if break_state then break else idx;
+ real_idx = idx;
+ next = if break_state then dummy_next else Array.make ncol unknown_state;
+ final = [];
+ desc = desc }
+
+let find_state re desc =
+ try
+ Automata.States.find re.states desc
+ with Not_found ->
+ let st = mk_state re.ncol desc in
+ Automata.States.add re.states desc st;
+ st
+
+(**** Match with marks ****)
+
+let delta info cat c st =
+ let (idx, _, _, _, _) as desc = Automata.delta info.re.tbl cat c st.desc in
+ let len = Array.length info.positions in
+ if idx = len && len > 0 then begin
+ let pos = info.positions in
+ info.positions <- Array.make (2 * len) 0;
+ Array.blit pos 0 info.positions 0 len
+ end;
+ desc
+
+let validate info s pos st =
+ let c = Char.code info.i_cols.[Char.code s.[pos]] in
+ let cat = category info.re c in
+ let desc' = delta info cat c st in
+ let st' = find_state info.re desc' in
+ st.next.(c) <- st'
+
+(*
+let rec loop info s pos st =
+ if pos < info.last then
+ let st' = st.next.(Char.code info.i_cols.[Char.code s.[pos]]) in
+ let idx = st'.idx in
+ if idx >= 0 then begin
+ info.positions.(idx) <- pos;
+ loop info s (pos + 1) st'
+ end else if idx = break then begin
+ info.positions.(st'.real_idx) <- pos;
+ st'
+ end else begin (* Unknown *)
+ validate info s pos st;
+ loop info s pos st
+ end
+ else
+ st
+*)
+
+let rec loop info s pos st =
+ if pos < info.last then
+ let st' = st.next.(Char.code info.i_cols.[Char.code s.[pos]]) in
+ loop2 info s pos st st'
+ else
+ st
+
+and loop2 info s pos st st' =
+ let idx = st'.idx in
+ if idx >= 0 then begin
+ let pos = pos + 1 in
+ if pos < info.last then begin
+ (* It is important to place these reads before the write *)
+ (* But then, we don't have enough registers left to store the
+ right position. So, we store the position plus one. *)
+ let st'' = st'.next.(Char.code info.i_cols.[Char.code s.[pos]]) in
+ info.positions.(idx) <- pos;
+ loop2 info s pos st' st''
+ end else begin
+ info.positions.(idx) <- pos;
+ st'
+ end
+ end else if idx = break then begin
+ info.positions.(st'.real_idx) <- pos + 1;
+ st'
+ end else begin (* Unknown *)
+ validate info s pos st;
+ loop info s pos st
+ end
+
+let rec loop_no_mark info s pos last st =
+ if pos < last then
+ let st' = st.next.(Char.code info.i_cols.[Char.code s.[pos]]) in
+ let idx = st'.idx in
+ if idx >= 0 then
+ loop_no_mark info s (pos + 1) last st'
+ else if idx = break then
+ st'
+ else begin (* Unknown *)
+ validate info s pos st;
+ loop_no_mark info s pos last st
+ end
+ else
+ st
+
+let final info st cat =
+ try
+ List.assq cat st.final
+ with Not_found ->
+ let (idx, _, _, _, _) as st' = delta info cat (-1) st in
+ let res = (idx, Automata.status st') in
+ st.final <- (cat, res) :: st.final;
+ res
+
+let find_initial_state re cat =
+ try
+ List.assq cat re.initial_states
+ with Not_found ->
+ let st =
+ find_state re (Automata.create_state cat re.initial)
+ in
+ re.initial_states <- (cat, st) :: re.initial_states;
+ st
+
+let dummy_substrings = `Match ("", [], [||], 0)
+
+let get_color re s pos =
+ if pos < 0 then -1 else
+ let slen = String.length s in
+ if pos >= slen then -1 else
+ (* Special case for the last newline *)
+ if pos = slen - 1 && re.lnl <> -1 && s.[pos] = '\n' then re.lnl else
+ Char.code re.cols.[Char.code s.[pos]]
+
+let rec handle_last_newline info pos st groups =
+ let st' = st.next.(info.re.lnl) in
+ let idx = st'.idx in
+ if idx >= 0 then begin
+ if groups then info.positions.(idx) <- pos + 1;
+ st'
+ end else if idx = break then begin
+ if groups then info.positions.(st'.real_idx) <- pos + 1;
+ st'
+ end else begin (* Unknown *)
+ let c = info.re.lnl in
+ let real_c = Char.code info.i_cols.[Char.code '\n'] in
+ let cat = category info.re c in
+ let desc' = delta info cat real_c st in
+ let st' = find_state info.re desc' in
+ st.next.(c) <- st';
+ handle_last_newline info pos st groups
+ end
+
+let rec scan_str info s initial_state groups =
+ let pos = info.pos in
+ let last = info.last in
+ if
+ last = String.length s &&
+ info.re.lnl <> -1 &&
+ last > pos &&
+ s.[last - 1] = '\n'
+ then begin
+ info.last <- last - 1;
+ let st = scan_str info s initial_state groups in
+ if st.idx = break then
+ st
+ else
+ handle_last_newline info (last - 1) st groups
+ end else if groups then
+ loop info s pos initial_state
+ else
+ loop_no_mark info s pos last initial_state
+
+let match_str groups re s pos len =
+ let slen = String.length s in
+ let last = if len = -1 then slen else pos + len in
+ let info =
+ { re = re; i_cols = re.cols; pos = pos; last = last;
+ positions =
+ if groups then begin
+ let n = Automata.index_count re.tbl + 1 in
+ if n <= 10 then
+ [|0;0;0;0;0;0;0;0;0;0|]
+ else
+ Array.make n 0
+ end else
+ [||] }
+ in
+ let initial_cat =
+ if pos = 0 then
+ cat_search_boundary lor cat_inexistant
+ else
+ cat_search_boundary lor category re (get_color re s (pos - 1)) in
+ let initial_state = find_initial_state re initial_cat in
+ let st = scan_str info s initial_state groups in
+ let res =
+ if st.idx = break then
+ Automata.status st.desc
+ else
+ let final_cat =
+ if last = slen then
+ cat_search_boundary lor cat_inexistant
+ else
+ cat_search_boundary lor category re (get_color re s last) in
+ let (idx, res) = final info st final_cat in
+ if groups then info.positions.(idx) <- last + 1;
+ res
+ in
+ match res with
+ `Match m ->
+ `Match (s, m, info.positions, re.group_count)
+ | (`Failed | `Running) as res ->
+ res
+
+let mk_re init cols col_repr ncol lnl group_count =
+ { initial = init;
+ initial_states = [];
+ cols = cols;
+ col_repr = col_repr;
+ ncol = ncol;
+ lnl = lnl;
+ tbl = Automata.create_working_area ();
+ states = Automata.States.create 97;
+ group_count = group_count }
+
+(**** Character sets ****)
+
+let cany = [0, 255]
+
+let cseq c c' = Cset.seq (Char.code c) (Char.code c')
+let cadd c s = Cset.add (Char.code c) s
+let csingle c = Cset.single (Char.code c)
+
+let rec interval i j = if i > j then [] else i :: interval (i + 1) j
+
+let rec cset_hash_rec l =
+ match l with
+ [] -> 0
+ | (i, j)::r -> i + 13 * j + 257 * cset_hash_rec r
+let cset_hash l = (cset_hash_rec l) land 0x3FFFFFFF
+
+module CSetMap =
+ Map.Make
+ (struct
+ type t = int * (int * int) list
+ let compare (i, u) (j, v) =
+ let c = compare i j in if c <> 0 then c else compare u v
+ end)
+
+let trans_set cache cm s =
+ match s with
+ [i, j] when i = j ->
+ csingle cm.[i]
+ | _ ->
+ let v = (cset_hash_rec s, s) in
+ try
+ CSetMap.find v !cache
+ with Not_found ->
+ let l =
+ List.fold_right
+ (fun (i, j) l -> Cset.union (cseq cm.[i] cm.[j]) l)
+ s Cset.empty
+ in
+ cache := CSetMap.add v l !cache;
+ l
+
+(****)
+
+type sem_status = Compulsory | Indicative
+
+type regexp =
+ Set of Cset.t
+ | Sequence of regexp list
+ | Alternative of regexp list
+ | Repeat of regexp * int * int option
+ | Beg_of_line | End_of_line
+ | Beg_of_word | End_of_word | Not_bound
+ | Beg_of_str | End_of_str
+ | Last_end_of_line | Start | Stop
+ | Sem of Automata.sem * regexp
+ | Sem_greedy of Automata.rep_kind * regexp
+ | Group of regexp | No_group of regexp | Nest of regexp
+ | Case of regexp | No_case of regexp
+ | Intersection of regexp list
+ | Complement of regexp list
+ | Difference of regexp * regexp
+
+let rec is_charset r =
+ match r with
+ Set _ ->
+ true
+ | Alternative l | Intersection l | Complement l ->
+ List.for_all is_charset l
+ | Difference (r, r') ->
+ is_charset r && is_charset r'
+ | Sem (_, r) | Sem_greedy (_, r)
+ | No_group r | Case r | No_case r ->
+ is_charset r
+ | Sequence _ | Repeat _ | Beg_of_line | End_of_line
+ | Beg_of_word | End_of_word | Beg_of_str | End_of_str
+ | Not_bound | Last_end_of_line | Start | Stop | Group _ | Nest _ ->
+ false
+
+(**** Colormap ****)
+
+(*XXX Use a better algorithm allowing non-contiguous regions? *)
+let rec split s cm =
+ match s with
+ [] -> ()
+ | (i, j)::r -> cm.[i] <- '\001'; cm.[j + 1] <- '\001'; split r cm
+
+let cupper =
+ Cset.union (cseq 'A' 'Z') (Cset.union (cseq '\192' '\214') (cseq '\216' '\222'))
+let clower = Cset.offset 32 cupper
+let calpha = cadd '\170' (cadd '\186' (Cset.union clower cupper))
+let cdigit = cseq '0' '9'
+let calnum = Cset.union calpha cdigit
+let cword = cadd '_' calnum
+
+let rec colorize c regexp =
+ let lnl = ref false in
+ let rec colorize regexp =
+ match regexp with
+ Set s -> split s c
+ | Sequence l -> List.iter colorize l
+ | Alternative l -> List.iter colorize l
+ | Repeat (r, _, _) -> colorize r
+ | Beg_of_line | End_of_line -> split (csingle '\n') c
+ | Beg_of_word | End_of_word
+ | Not_bound -> split cword c
+ | Beg_of_str | End_of_str
+ | Start | Stop | Not_bound -> ()
+ | Last_end_of_line -> lnl := true
+ | Sem (_, r)
+ | Sem_greedy (_, r)
+ | Group r | No_group r
+ | Nest r -> colorize r
+ | Case _ | No_case _
+ | Intersection _
+ | Complement _
+ | Difference _ -> assert false
+ in
+ colorize regexp;
+ !lnl
+
+let make_cmap () = String.make 257 '\000'
+
+let flatten_cmap cm =
+ let c = String.create 256 in
+ let col_repr = String.create 256 in
+ let v = ref 0 in
+ c.[0] <- '\000';
+ col_repr.[0] <- '\000';
+ for i = 1 to 255 do
+ if cm.[i] <> '\000' then incr v;
+ c.[i] <- Char.chr !v;
+ col_repr.[!v] <- Char.chr i
+ done;
+ (c, String.sub col_repr 0 (!v + 1), !v + 1)
+
+(**** Compilation ****)
+
+let sequence l =
+ match l with
+ [x] -> x
+ | l -> Sequence l
+
+let rec merge_sequences l =
+ match l with
+ [] ->
+ l
+ | Alternative l' :: r ->
+ merge_sequences (l' @ r)
+ | Sequence (x :: y) :: r ->
+ begin match merge_sequences r with
+ Sequence (x' :: y') :: r' when x = x' ->
+ Sequence [x; Alternative [sequence y; sequence y']] :: r'
+ | r' ->
+ Sequence (x :: y) :: r'
+ end
+ | x :: r ->
+ x :: merge_sequences r
+
+module A = Automata
+
+let enforce_kind ids kind kind' cr =
+ match kind, kind' with
+ `First, `First -> cr
+ | `First, k -> A.seq ids k cr (A.eps ids)
+ | _ -> cr
+
+(* XXX should probably compute a category mask *)
+let rec translate ids kind ign_group ign_case greedy pos cache c r =
+ match r with
+ Set s ->
+ (A.cst ids (trans_set cache c s), kind)
+ | Sequence l ->
+ (trans_seq ids kind ign_group ign_case greedy pos cache c l, kind)
+ | Alternative l ->
+ begin match merge_sequences l with
+ [r'] ->
+ let (cr, kind') =
+ translate ids kind ign_group ign_case greedy pos cache c r' in
+ (enforce_kind ids kind kind' cr, kind)
+ | l' ->
+ (A.alt ids
+ (List.map
+ (fun r' ->
+ let (cr, kind') =
+ translate ids kind ign_group ign_case greedy
+ pos cache c r' in
+ enforce_kind ids kind kind' cr)
+ (merge_sequences l)),
+ kind)
+ end
+ | Repeat (r', i, j) ->
+ let (cr, kind') =
+ translate ids kind ign_group ign_case greedy pos cache c r' in
+ let rem =
+ match j with
+ None ->
+ A.rep ids greedy kind' cr
+ | Some j ->
+ let f =
+ match greedy with
+ `Greedy ->
+ fun rem ->
+ A.alt ids
+ [A.seq ids kind' (A.rename ids cr) rem; A.eps ids]
+ | `Non_greedy ->
+ fun rem ->
+ A.alt ids
+ [A.eps ids; A.seq ids kind' (A.rename ids cr) rem]
+ in
+ iter (j - i) f (A.eps ids)
+ in
+ (iter i (fun rem -> A.seq ids kind' (A.rename ids cr) rem) rem, kind)
+ | Beg_of_line ->
+ (A.after ids (cat_inexistant lor cat_newline), kind)
+ | End_of_line ->
+ (A.before ids (cat_inexistant lor cat_newline), kind)
+ | Beg_of_word ->
+ (A.seq ids `First
+ (A.after ids (cat_inexistant lor cat_not_letter))
+ (A.before ids (cat_inexistant lor cat_letter)),
+ kind)
+ | End_of_word ->
+ (A.seq ids `First
+ (A.after ids (cat_inexistant lor cat_letter))
+ (A.before ids (cat_inexistant lor cat_not_letter)),
+ kind)
+ | Not_bound ->
+ (A.alt ids [A.seq ids `First
+ (A.after ids cat_letter)
+ (A.before ids cat_letter);
+ A.seq ids `First
+ (A.after ids cat_letter)
+ (A.before ids cat_letter)],
+ kind)
+ | Beg_of_str ->
+ (A.after ids cat_inexistant, kind)
+ | End_of_str ->
+ (A.before ids cat_inexistant, kind)
+ | Last_end_of_line ->
+ (A.before ids (cat_inexistant lor cat_lastnewline), kind)
+ | Start ->
+ (A.after ids cat_search_boundary, kind)
+ | Stop ->
+ (A.before ids cat_search_boundary, kind)
+ | Sem (kind', r') ->
+ let (cr, kind'') =
+ translate ids kind' ign_group ign_case greedy pos cache c r' in
+ (enforce_kind ids kind' kind'' cr,
+ kind')
+ | Sem_greedy (greedy', r') ->
+ translate ids kind ign_group ign_case greedy' pos cache c r'
+ | Group r' ->
+ if ign_group then
+ translate ids kind ign_group ign_case greedy pos cache c r'
+ else
+ let p = !pos in
+ pos := !pos + 2;
+ let (cr, kind') =
+ translate ids kind ign_group ign_case greedy pos cache c r' in
+ (A.seq ids `First (A.mark ids p) (
+ A.seq ids `First cr (A.mark ids (p + 1))),
+ kind')
+ | No_group r' ->
+ translate ids kind true ign_case greedy pos cache c r'
+ | Nest r' ->
+ let b = !pos in
+ let (cr, kind') =
+ translate ids kind ign_group ign_case greedy pos cache c r'
+ in
+ let e = !pos - 1 in
+ if e < b then
+ (cr, kind')
+ else
+ (A.seq ids `First (A.erase ids b e) cr, kind')
+ | Difference _ | Complement _ | Intersection _ | No_case _ | Case _ ->
+ assert false
+
+and trans_seq ids kind ign_group ign_case greedy pos cache c l =
+ match l with
+ [] ->
+ A.eps ids
+ | [r] ->
+ let (cr', kind') =
+ translate ids kind ign_group ign_case greedy pos cache c r in
+ enforce_kind ids kind kind' cr'
+ | r :: rem ->
+ let (cr', kind') =
+ translate ids kind ign_group ign_case greedy pos cache c r in
+ let cr'' =
+ trans_seq ids kind ign_group ign_case greedy pos cache c rem in
+ if A.def cr'' = A.Eps then
+ cr'
+ else if A.def cr' = A.Eps then
+ cr''
+ else
+ A.seq ids kind' cr' cr''
+
+(**** Case ****)
+
+let case_insens s =
+ Cset.union s (Cset.union (Cset.offset 32 (Cset.inter s cupper))
+ (Cset.offset (-32) (Cset.inter s clower)))
+
+let as_set r =
+ match r with
+ Set s -> s
+ | _ -> assert false
+
+(* XXX Should split alternatives into (1) charsets and (2) more
+ complex regular expressions; alternative should therefore probably
+ be flatten here *)
+let rec handle_case ign_case r =
+ match r with
+ Set s ->
+ Set (if ign_case then case_insens s else s)
+ | Sequence l ->
+ Sequence (List.map (handle_case ign_case) l)
+ | Alternative l ->
+ let l' = List.map (handle_case ign_case) l in
+ if is_charset (Alternative l') then
+ Set (List.fold_left (fun s r -> Cset.union s (as_set r)) Cset.empty l')
+ else
+ Alternative l'
+ | Repeat (r, i, j) ->
+ Repeat (handle_case ign_case r, i, j)
+ | Beg_of_line | End_of_line | Beg_of_word | End_of_word | Not_bound
+ | Beg_of_str | End_of_str | Last_end_of_line | Start | Stop ->
+ r
+ | Sem (k, r) ->
+ let r' = handle_case ign_case r in
+ if is_charset r' then r' else
+ Sem (k, r')
+ | Sem_greedy (k, r) ->
+ let r' = handle_case ign_case r in
+ if is_charset r' then r' else
+ Sem_greedy (k, r')
+ | Group r ->
+ Group (handle_case ign_case r)
+ | No_group r ->
+ let r' = handle_case ign_case r in
+ if is_charset r' then r' else
+ No_group r'
+ | Nest r ->
+ let r' = handle_case ign_case r in
+ if is_charset r' then r' else
+ Nest r'
+ | Case r ->
+ handle_case false r
+ | No_case r ->
+ handle_case true r
+ | Intersection l ->
+ let l' = List.map (fun r -> handle_case ign_case r) l in
+ Set (List.fold_left (fun s r -> Cset.inter s (as_set r)) cany l')
+ | Complement l ->
+ let l' = List.map (fun r -> handle_case ign_case r) l in
+ Set (Cset.diff cany
+ (List.fold_left (fun s r -> Cset.union s (as_set r))
+ Cset.empty l'))
+ | Difference (r, r') ->
+ Set (Cset.inter (as_set (handle_case ign_case r))
+ (Cset.diff cany (as_set (handle_case ign_case r'))))
+
+(****)
+
+let compile_1 regexp =
+ let regexp = handle_case false regexp in