From 3e5543c180b16b95fe01c07c827d98fdba8d004f Mon Sep 17 00:00:00 2001
From: Andrew Gallant <jamslam@gmail.com>
Date: Fri, 26 Aug 2022 19:11:04 -0400
Subject: [PATCH] syntax: permit empty character classes

An empty character class is effectively a way to write something that
can never match anything. The regex crate has pretty much always
returned an error for such things because it was never taught how to
handle "always fail" states. Partly because I just didn't think about it
when initially writing the regex engines and partly because it isn't
often useful.

With that said, it should be supported for completeness and because
there is no real reason to not support it. Moreover, it can be useful in
certain contexts where regexes are generated and you want to insert an
expression that can never match. It's somewhat contrived, but it
happens when the interface is a regex pattern.

Previously, the ban on empty character classes was implemented in the
regex-syntax crate. But with the rewrite in #656 getting closer and
closer to landing, it's now time to relax this restriction. However, we
do keep the overall restriction in the 'regex' API by returning an error
in the NFA compiler. Once #656 is done, the new regex engines will
permit this case.
---
 regex-syntax/src/hir/mod.rs       | 14 ------
 regex-syntax/src/hir/translate.rs | 73 +++----------------------------
 src/compile.rs                    | 12 ++++-
 3 files changed, 15 insertions(+), 84 deletions(-)

diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs
index 1af8d4c32..23fef78f9 100644
--- a/regex-syntax/src/hir/mod.rs
+++ b/regex-syntax/src/hir/mod.rs
@@ -78,21 +78,8 @@ pub enum ErrorKind {
     /// available, and the regular expression required Unicode aware case
     /// insensitivity.
     UnicodeCaseUnavailable,
-    /// This occurs when the translator attempts to construct a character class
-    /// that is empty.
-    ///
-    /// Note that this restriction in the translator may be removed in the
-    /// future.
-    EmptyClassNotAllowed,
 }
 
-// BREADCRUMBS:
-//
-// Remove EmptyClassNotAllowed
-// Make errors non_exhaustive
-// Simplify repetitions (get rid of ZeroOrOne, OneOrMore etc)
-// Get rid of deprecated things
-
 impl std::error::Error for Error {}
 
 impl fmt::Display for Error {
@@ -118,7 +105,6 @@ impl fmt::Display for ErrorKind {
                 "Unicode-aware case insensitivity matching is not available \
                  (make sure the unicode-case feature is enabled)"
             }
-            EmptyClassNotAllowed => "empty character classes are not allowed",
         };
         f.write_str(msg)
     }
diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs
index 04409cf95..d7686988a 100644
--- a/regex-syntax/src/hir/translate.rs
+++ b/regex-syntax/src/hir/translate.rs
@@ -322,12 +322,6 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
                         ast.negated,
                         &mut cls,
                     )?;
-                    if cls.ranges().is_empty() {
-                        return Err(self.error(
-                            ast.span,
-                            ErrorKind::EmptyClassNotAllowed,
-                        ));
-                    }
                     let expr = Hir::class(hir::Class::Unicode(cls));
                     self.push(HirFrame::Expr(expr));
                 } else {
@@ -337,13 +331,6 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
                         ast.negated,
                         &mut cls,
                     )?;
-                    if cls.ranges().is_empty() {
-                        return Err(self.error(
-                            ast.span,
-                            ErrorKind::EmptyClassNotAllowed,
-                        ));
-                    }
-
                     let expr = Hir::class(hir::Class::Bytes(cls));
                     self.push(HirFrame::Expr(expr));
                 }
@@ -839,11 +826,6 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
                 ast_class.negated,
                 class,
             )?;
-            if class.ranges().is_empty() {
-                let err = self
-                    .error(ast_class.span, ErrorKind::EmptyClassNotAllowed);
-                return Err(err);
-            }
         }
         result
     }
@@ -2357,16 +2339,7 @@ mod tests {
     #[test]
     #[cfg(feature = "unicode-gencat")]
     fn class_unicode_any_empty() {
-        assert_eq!(
-            t_err(r"\P{any}"),
-            TestError {
-                kind: hir::ErrorKind::EmptyClassNotAllowed,
-                span: Span::new(
-                    Position::new(0, 1, 1),
-                    Position::new(7, 1, 8)
-                ),
-            }
-        );
+        assert_eq!(t(r"\P{any}"), hir_uclass(&[]),);
     }
 
     #[test]
@@ -2518,27 +2491,9 @@ mod tests {
             }
         );
         #[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))]
-        assert_eq!(
-            t_err(r"[^\s\S]"),
-            TestError {
-                kind: hir::ErrorKind::EmptyClassNotAllowed,
-                span: Span::new(
-                    Position::new(0, 1, 1),
-                    Position::new(7, 1, 8)
-                ),
-            }
-        );
+        assert_eq!(t(r"[^\s\S]"), hir_uclass(&[]),);
         #[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))]
-        assert_eq!(
-            t_err(r"(?-u)[^\s\S]"),
-            TestError {
-                kind: hir::ErrorKind::EmptyClassNotAllowed,
-                span: Span::new(
-                    Position::new(5, 1, 6),
-                    Position::new(12, 1, 13)
-                ),
-            }
-        );
+        assert_eq!(t_bytes(r"(?-u)[^\s\S]"), hir_bclass(&[]),);
     }
 
     #[test]
@@ -2686,27 +2641,9 @@ mod tests {
             hir_uclass(&[('C', 'C'), ('c', 'c')])
         );
 
-        assert_eq!(
-            t_err(r"[^a-c[^c]]"),
-            TestError {
-                kind: hir::ErrorKind::EmptyClassNotAllowed,
-                span: Span::new(
-                    Position::new(0, 1, 1),
-                    Position::new(10, 1, 11)
-                ),
-            }
-        );
+        assert_eq!(t(r"[^a-c[^c]]"), hir_uclass(&[]),);
         #[cfg(feature = "unicode-case")]
-        assert_eq!(
-            t_err(r"(?i)[^a-c[^c]]"),
-            TestError {
-                kind: hir::ErrorKind::EmptyClassNotAllowed,
-                span: Span::new(
-                    Position::new(4, 1, 5),
-                    Position::new(14, 1, 15)
-                ),
-            }
-        );
+        assert_eq!(t(r"(?i)[^a-c[^c]]"), hir_uclass(&[]),);
     }
 
     #[test]
diff --git a/src/compile.rs b/src/compile.rs
index 90ca25015..361ea4cb7 100644
--- a/src/compile.rs
+++ b/src/compile.rs
@@ -457,7 +457,11 @@ impl Compiler {
     fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> ResultOrEmpty {
         use std::mem::size_of;
 
-        assert!(!ranges.is_empty());
+        if ranges.is_empty() {
+            return Err(Error::Syntax(
+                "empty character classes are not allowed".to_string(),
+            ));
+        }
         if self.compiled.uses_bytes() {
             Ok(Some(CompileClass { c: self, ranges }.compile()?))
         } else {
@@ -482,7 +486,11 @@ impl Compiler {
         &mut self,
         ranges: &[hir::ClassBytesRange],
     ) -> ResultOrEmpty {
-        debug_assert!(!ranges.is_empty());
+        if ranges.is_empty() {
+            return Err(Error::Syntax(
+                "empty character classes are not allowed".to_string(),
+            ));
+        }
 
         let first_split_entry = self.insts.len();
         let mut holes = vec![];