Permalink
Cannot retrieve contributors at this time
Join GitHub today
GitHub is home to over 31 million developers working together to host and review code, manage projects, and build software together.
Sign up
Find file
Copy path
Fetching contributors…
| // Copyright 2018 The Rust Project Developers. See the COPYRIGHT | |
| // file at the top-level directory of this distribution and at | |
| // http://rust-lang.org/COPYRIGHT. | |
| // | |
| // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or | |
| // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license | |
| // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your | |
| // option. This file may not be copied, modified, or distributed | |
| // except according to those terms. | |
| /*! | |
| Defines a high-level intermediate representation for regular expressions. | |
| */ | |
| use std::char; | |
| use std::cmp; | |
| use std::error; | |
| use std::fmt; | |
| use std::u8; | |
| use ast::Span; | |
| use hir::interval::{Interval, IntervalSet, IntervalSetIter}; | |
| use unicode; | |
| pub use hir::visitor::{Visitor, visit}; | |
| mod interval; | |
| pub mod literal; | |
| pub mod print; | |
| pub mod translate; | |
| mod visitor; | |
| /// An error that can occur while translating an `Ast` to a `Hir`. | |
| #[derive(Clone, Debug, Eq, PartialEq)] | |
| pub struct Error { | |
| /// The kind of error. | |
| kind: ErrorKind, | |
| /// The original pattern that the translator's Ast was parsed from. Every | |
| /// span in an error is a valid range into this string. | |
| pattern: String, | |
| /// The span of this error, derived from the Ast given to the translator. | |
| span: Span, | |
| } | |
| impl Error { | |
| /// Return the type of this error. | |
| pub fn kind(&self) -> &ErrorKind { | |
| &self.kind | |
| } | |
| /// The original pattern string in which this error occurred. | |
| /// | |
| /// Every span reported by this error is reported in terms of this string. | |
| pub fn pattern(&self) -> &str { | |
| &self.pattern | |
| } | |
| /// Return the span at which this error occurred. | |
| pub fn span(&self) -> &Span { | |
| &self.span | |
| } | |
| } | |
| /// The type of an error that occurred while building an `Hir`. | |
| #[derive(Clone, Debug, Eq, PartialEq)] | |
| pub enum ErrorKind { | |
| /// This error occurs when a Unicode feature is used when Unicode | |
| /// support is disabled. For example `(?-u:\pL)` would trigger this error. | |
| UnicodeNotAllowed, | |
| /// This error occurs when translating a pattern that could match a byte | |
| /// sequence that isn't UTF-8 and `allow_invalid_utf8` was disabled. | |
| InvalidUtf8, | |
| /// This occurs when an unrecognized Unicode property name could not | |
| /// be found. | |
| UnicodePropertyNotFound, | |
| /// This occurs when an unrecognized Unicode property value could not | |
| /// be found. | |
| UnicodePropertyValueNotFound, | |
| /// This occurs when the translator attempts to construct a character class | |
| /// that is empty. | |
| /// | |
| /// Note that this restriction in the translator may be removed in the | |
| /// future. | |
| EmptyClassNotAllowed, | |
| /// Hints that destructuring should not be exhaustive. | |
| /// | |
| /// This enum may grow additional variants, so this makes sure clients | |
| /// don't count on exhaustive matching. (Otherwise, adding a new variant | |
| /// could break existing code.) | |
| #[doc(hidden)] | |
| __Nonexhaustive, | |
| } | |
| impl ErrorKind { | |
| fn description(&self) -> &str { | |
| use self::ErrorKind::*; | |
| match *self { | |
| UnicodeNotAllowed => "Unicode not allowed here", | |
| InvalidUtf8 => "pattern can match invalid UTF-8", | |
| UnicodePropertyNotFound => "Unicode property not found", | |
| UnicodePropertyValueNotFound => "Unicode property value not found", | |
| EmptyClassNotAllowed => "empty character classes are not allowed", | |
| _ => unreachable!(), | |
| } | |
| } | |
| } | |
| impl error::Error for Error { | |
| fn description(&self) -> &str { | |
| self.kind.description() | |
| } | |
| } | |
| impl fmt::Display for Error { | |
| fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { | |
| ::error::Formatter::from(self).fmt(f) | |
| } | |
| } | |
| impl fmt::Display for ErrorKind { | |
| fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { | |
| f.write_str(self.description()) | |
| } | |
| } | |
| /// A high-level intermediate representation (HIR) for a regular expression. | |
| /// | |
| /// The HIR of a regular expression represents an intermediate step between its | |
| /// abstract syntax (a structured description of the concrete syntax) and | |
| /// compiled byte codes. The purpose of HIR is to make regular expressions | |
| /// easier to analyze. In particular, the AST is much more complex than the | |
| /// HIR. For example, while an AST supports arbitrarily nested character | |
| /// classes, the HIR will flatten all nested classes into a single set. The HIR | |
| /// will also "compile away" every flag present in the concrete syntax. For | |
| /// example, users of HIR expressions never need to worry about case folding; | |
| /// it is handled automatically by the translator (e.g., by translating `(?i)A` | |
| /// to `[aA]`). | |
| /// | |
| /// If the HIR was produced by a translator that disallows invalid UTF-8, then | |
| /// the HIR is guaranteed to match UTF-8 exclusively. | |
| /// | |
| /// This type defines its own destructor that uses constant stack space and | |
| /// heap space proportional to the size of the HIR. | |
| /// | |
| /// The specific type of an HIR expression can be accessed via its `kind` | |
| /// or `into_kind` methods. This extra level of indirection exists for two | |
| /// reasons: | |
| /// | |
| /// 1. Construction of an HIR expression *must* use the constructor methods | |
| /// on this `Hir` type instead of building the `HirKind` values directly. | |
| /// This permits construction to enforce invariants like "concatenations | |
| /// always consist of two or more sub-expressions." | |
| /// 2. Every HIR expression contains attributes that are defined inductively, | |
| /// and can be computed cheaply during the construction process. For | |
| /// example, one such attribute is whether the expression must match at the | |
| /// beginning of the text. | |
| /// | |
| /// Also, an `Hir`'s `fmt::Display` implementation prints an HIR as a regular | |
| /// expression pattern string, and uses constant stack space and heap space | |
| /// proportional to the size of the `Hir`. | |
| #[derive(Clone, Debug, Eq, PartialEq)] | |
| pub struct Hir { | |
| /// The underlying HIR kind. | |
| kind: HirKind, | |
| /// Analysis info about this HIR, computed during construction. | |
| info: HirInfo, | |
| } | |
| /// The kind of an arbitrary `Hir` expression. | |
| #[derive(Clone, Debug, Eq, PartialEq)] | |
| pub enum HirKind { | |
| /// The empty regular expression, which matches everything, including the | |
| /// empty string. | |
| Empty, | |
| /// A single literal character that matches exactly this character. | |
| Literal(Literal), | |
| /// A single character class that matches any of the characters in the | |
| /// class. A class can either consist of Unicode scalar values as | |
| /// characters, or it can use bytes. | |
| Class(Class), | |
| /// An anchor assertion. An anchor assertion match always has zero length. | |
| Anchor(Anchor), | |
| /// A word boundary assertion, which may or may not be Unicode aware. A | |
| /// word boundary assertion match always has zero length. | |
| WordBoundary(WordBoundary), | |
| /// A repetition operation applied to a child expression. | |
| Repetition(Repetition), | |
| /// A possibly capturing group, which contains a child expression. | |
| Group(Group), | |
| /// A concatenation of expressions. A concatenation always has at least two | |
| /// child expressions. | |
| /// | |
| /// A concatenation matches only if each of its child expression matches | |
| /// one after the other. | |
| Concat(Vec<Hir>), | |
| /// An alternation of expressions. An alternation always has at least two | |
| /// child expressions. | |
| /// | |
| /// An alternation matches only if at least one of its child expression | |
| /// matches. If multiple expressions match, then the leftmost is preferred. | |
| Alternation(Vec<Hir>), | |
| } | |
| impl Hir { | |
| /// Returns a reference to the underlying HIR kind. | |
| pub fn kind(&self) -> &HirKind { | |
| &self.kind | |
| } | |
| /// Consumes ownership of this HIR expression and returns its underlying | |
| /// `HirKind`. | |
| pub fn into_kind(mut self) -> HirKind { | |
| use std::mem; | |
| mem::replace(&mut self.kind, HirKind::Empty) | |
| } | |
| /// Returns an empty HIR expression. | |
| /// | |
| /// An empty HIR expression always matches, including the empty string. | |
| pub fn empty() -> Hir { | |
| let mut info = HirInfo::new(); | |
| info.set_always_utf8(true); | |
| info.set_all_assertions(true); | |
| info.set_anchored_start(false); | |
| info.set_anchored_end(false); | |
| info.set_any_anchored_start(false); | |
| info.set_any_anchored_end(false); | |
| info.set_match_empty(true); | |
| Hir { | |
| kind: HirKind::Empty, | |
| info: info, | |
| } | |
| } | |
| /// Creates a literal HIR expression. | |
| /// | |
| /// If the given literal has a `Byte` variant with an ASCII byte, then this | |
| /// method panics. This enforces the invariant that `Byte` variants are | |
| /// only used to express matching of invalid UTF-8. | |
| pub fn literal(lit: Literal) -> Hir { | |
| if let Literal::Byte(b) = lit { | |
| assert!(b > 0x7F); | |
| } | |
| let mut info = HirInfo::new(); | |
| info.set_always_utf8(lit.is_unicode()); | |
| info.set_all_assertions(false); | |
| info.set_anchored_start(false); | |
| info.set_anchored_end(false); | |
| info.set_any_anchored_start(false); | |
| info.set_any_anchored_end(false); | |
| info.set_match_empty(false); | |
| Hir { | |
| kind: HirKind::Literal(lit), | |
| info: info, | |
| } | |
| } | |
| /// Creates a class HIR expression. | |
| pub fn class(class: Class) -> Hir { | |
| let mut info = HirInfo::new(); | |
| info.set_always_utf8(class.is_always_utf8()); | |
| info.set_all_assertions(false); | |
| info.set_anchored_start(false); | |
| info.set_anchored_end(false); | |
| info.set_any_anchored_start(false); | |
| info.set_any_anchored_end(false); | |
| info.set_match_empty(false); | |
| Hir { | |
| kind: HirKind::Class(class), | |
| info: info, | |
| } | |
| } | |
| /// Creates an anchor assertion HIR expression. | |
| pub fn anchor(anchor: Anchor) -> Hir { | |
| let mut info = HirInfo::new(); | |
| info.set_always_utf8(true); | |
| info.set_all_assertions(true); | |
| info.set_anchored_start(false); | |
| info.set_anchored_end(false); | |
| info.set_any_anchored_start(false); | |
| info.set_any_anchored_end(false); | |
| info.set_match_empty(true); | |
| if let Anchor::StartText = anchor { | |
| info.set_anchored_start(true); | |
| info.set_any_anchored_start(true); | |
| } | |
| if let Anchor::EndText = anchor { | |
| info.set_anchored_end(true); | |
| info.set_any_anchored_end(true); | |
| } | |
| Hir { | |
| kind: HirKind::Anchor(anchor), | |
| info: info, | |
| } | |
| } | |
| /// Creates a word boundary assertion HIR expression. | |
| pub fn word_boundary(word_boundary: WordBoundary) -> Hir { | |
| let mut info = HirInfo::new(); | |
| info.set_always_utf8(true); | |
| info.set_all_assertions(true); | |
| info.set_anchored_start(false); | |
| info.set_anchored_end(false); | |
| info.set_any_anchored_start(false); | |
| info.set_any_anchored_end(false); | |
| // A negated word boundary matches the empty string, but a normal | |
| // word boundary does not! | |
| info.set_match_empty(word_boundary.is_negated()); | |
| // Negated ASCII word boundaries can match invalid UTF-8. | |
| if let WordBoundary::AsciiNegate = word_boundary { | |
| info.set_always_utf8(false); | |
| } | |
| Hir { | |
| kind: HirKind::WordBoundary(word_boundary), | |
| info: info, | |
| } | |
| } | |
| /// Creates a repetition HIR expression. | |
| pub fn repetition(rep: Repetition) -> Hir { | |
| let mut info = HirInfo::new(); | |
| info.set_always_utf8(rep.hir.is_always_utf8()); | |
| info.set_all_assertions(rep.hir.is_all_assertions()); | |
| // If this operator can match the empty string, then it can never | |
| // be anchored. | |
| info.set_anchored_start( | |
| !rep.is_match_empty() && rep.hir.is_anchored_start() | |
| ); | |
| info.set_anchored_end( | |
| !rep.is_match_empty() && rep.hir.is_anchored_end() | |
| ); | |
| info.set_any_anchored_start(rep.hir.is_any_anchored_start()); | |
| info.set_any_anchored_end(rep.hir.is_any_anchored_end()); | |
| info.set_match_empty(rep.is_match_empty() || rep.hir.is_match_empty()); | |
| Hir { | |
| kind: HirKind::Repetition(rep), | |
| info: info, | |
| } | |
| } | |
| /// Creates a group HIR expression. | |
| pub fn group(group: Group) -> Hir { | |
| let mut info = HirInfo::new(); | |
| info.set_always_utf8(group.hir.is_always_utf8()); | |
| info.set_all_assertions(group.hir.is_all_assertions()); | |
| info.set_anchored_start(group.hir.is_anchored_start()); | |
| info.set_anchored_end(group.hir.is_anchored_end()); | |
| info.set_any_anchored_start(group.hir.is_any_anchored_start()); | |
| info.set_any_anchored_end(group.hir.is_any_anchored_end()); | |
| info.set_match_empty(group.hir.is_match_empty()); | |
| Hir { | |
| kind: HirKind::Group(group), | |
| info: info, | |
| } | |
| } | |
| /// Returns the concatenation of the given expressions. | |
| /// | |
| /// This flattens the concatenation as appropriate. | |
| pub fn concat(mut exprs: Vec<Hir>) -> Hir { | |
| match exprs.len() { | |
| 0 => Hir::empty(), | |
| 1 => exprs.pop().unwrap(), | |
| _ => { | |
| let mut info = HirInfo::new(); | |
| info.set_always_utf8(true); | |
| info.set_all_assertions(true); | |
| info.set_any_anchored_start(false); | |
| info.set_any_anchored_end(false); | |
| info.set_match_empty(true); | |
| // Some attributes require analyzing all sub-expressions. | |
| for e in &exprs { | |
| let x = info.is_always_utf8() && e.is_always_utf8(); | |
| info.set_always_utf8(x); | |
| let x = info.is_all_assertions() && e.is_all_assertions(); | |
| info.set_all_assertions(x); | |
| let x = | |
| info.is_any_anchored_start() | |
| || e.is_any_anchored_start(); | |
| info.set_any_anchored_start(x); | |
| let x = | |
| info.is_any_anchored_end() | |
| || e.is_any_anchored_end(); | |
| info.set_any_anchored_end(x); | |
| let x = info.is_match_empty() && e.is_match_empty(); | |
| info.set_match_empty(x); | |
| } | |
| // Anchored attributes require something slightly more | |
| // sophisticated. Normally, WLOG, to determine whether an | |
| // expression is anchored to the start, we'd only need to check | |
| // the first expression of a concatenation. However, | |
| // expressions like `$\b^` are still anchored to the start, | |
| // but the first expression in the concatenation *isn't* | |
| // anchored to the start. So the "first" expression to look at | |
| // is actually one that is either not an assertion or is | |
| // specifically the StartText assertion. | |
| info.set_anchored_start( | |
| exprs.iter() | |
| .take_while(|e| { | |
| e.is_anchored_start() || e.is_all_assertions() | |
| }) | |
| .any(|e| { | |
| e.is_anchored_start() | |
| })); | |
| // Similarly for the end anchor, but in reverse. | |
| info.set_anchored_end( | |
| exprs.iter() | |
| .rev() | |
| .take_while(|e| { | |
| e.is_anchored_end() || e.is_all_assertions() | |
| }) | |
| .any(|e| { | |
| e.is_anchored_end() | |
| })); | |
| Hir { | |
| kind: HirKind::Concat(exprs), | |
| info: info, | |
| } | |
| } | |
| } | |
| } | |
| /// Returns the alternation of the given expressions. | |
| /// | |
| /// This flattens the alternation as appropriate. | |
| pub fn alternation(mut exprs: Vec<Hir>) -> Hir { | |
| match exprs.len() { | |
| 0 => Hir::empty(), | |
| 1 => exprs.pop().unwrap(), | |
| _ => { | |
| let mut info = HirInfo::new(); | |
| info.set_always_utf8(true); | |
| info.set_all_assertions(true); | |
| info.set_anchored_start(true); | |
| info.set_anchored_end(true); | |
| info.set_any_anchored_start(false); | |
| info.set_any_anchored_end(false); | |
| info.set_match_empty(false); | |
| // Some attributes require analyzing all sub-expressions. | |
| for e in &exprs { | |
| let x = info.is_always_utf8() && e.is_always_utf8(); | |
| info.set_always_utf8(x); | |
| let x = info.is_all_assertions() && e.is_all_assertions(); | |
| info.set_all_assertions(x); | |
| let x = info.is_anchored_start() && e.is_anchored_start(); | |
| info.set_anchored_start(x); | |
| let x = info.is_anchored_end() && e.is_anchored_end(); | |
| info.set_anchored_end(x); | |
| let x = | |
| info.is_any_anchored_start() | |
| || e.is_any_anchored_start(); | |
| info.set_any_anchored_start(x); | |
| let x = | |
| info.is_any_anchored_end() | |
| || e.is_any_anchored_end(); | |
| info.set_any_anchored_end(x); | |
| let x = info.is_match_empty() || e.is_match_empty(); | |
| info.set_match_empty(x); | |
| } | |
| Hir { | |
| kind: HirKind::Alternation(exprs), | |
| info: info, | |
| } | |
| } | |
| } | |
| } | |
| /// Build an HIR expression for `.`. | |
| /// | |
| /// A `.` expression matches any character except for `\n`. To build an | |
| /// expression that matches any character, including `\n`, use the `any` | |
| /// method. | |
| /// | |
| /// If `bytes` is `true`, then this assumes characters are limited to a | |
| /// single byte. | |
| pub fn dot(bytes: bool) -> Hir { | |
| if bytes { | |
| let mut cls = ClassBytes::empty(); | |
| cls.push(ClassBytesRange::new(b'\0', b'\x09')); | |
| cls.push(ClassBytesRange::new(b'\x0B', b'\xFF')); | |
| Hir::class(Class::Bytes(cls)) | |
| } else { | |
| let mut cls = ClassUnicode::empty(); | |
| cls.push(ClassUnicodeRange::new('\0', '\x09')); | |
| cls.push(ClassUnicodeRange::new('\x0B', '\u{10FFFF}')); | |
| Hir::class(Class::Unicode(cls)) | |
| } | |
| } | |
| /// Build an HIR expression for `(?s).`. | |
| /// | |
| /// A `(?s).` expression matches any character, including `\n`. To build an | |
| /// expression that matches any character except for `\n`, then use the | |
| /// `dot` method. | |
| /// | |
| /// If `bytes` is `true`, then this assumes characters are limited to a | |
| /// single byte. | |
| pub fn any(bytes: bool) -> Hir { | |
| if bytes { | |
| let mut cls = ClassBytes::empty(); | |
| cls.push(ClassBytesRange::new(b'\0', b'\xFF')); | |
| Hir::class(Class::Bytes(cls)) | |
| } else { | |
| let mut cls = ClassUnicode::empty(); | |
| cls.push(ClassUnicodeRange::new('\0', '\u{10FFFF}')); | |
| Hir::class(Class::Unicode(cls)) | |
| } | |
| } | |
| /// Return true if and only if this HIR will always match valid UTF-8. | |
| /// | |
| /// When this returns false, then it is possible for this HIR expression | |
| /// to match invalid UTF-8. | |
| pub fn is_always_utf8(&self) -> bool { | |
| self.info.is_always_utf8() | |
| } | |
| /// Returns true if and only if this entire HIR expression is made up of | |
| /// zero-width assertions. | |
| /// | |
| /// This includes expressions like `^$\b\A\z` and even `((\b)+())*^`, but | |
| /// not `^a`. | |
| pub fn is_all_assertions(&self) -> bool { | |
| self.info.is_all_assertions() | |
| } | |
| /// Return true if and only if this HIR is required to match from the | |
| /// beginning of text. This includes expressions like `^foo`, `^(foo|bar)`, | |
| /// `^foo|^bar` but not `^foo|bar`. | |
| pub fn is_anchored_start(&self) -> bool { | |
| self.info.is_anchored_start() | |
| } | |
| /// Return true if and only if this HIR is required to match at the end | |
| /// of text. This includes expressions like `foo$`, `(foo|bar)$`, | |
| /// `foo$|bar$` but not `foo$|bar`. | |
| pub fn is_anchored_end(&self) -> bool { | |
| self.info.is_anchored_end() | |
| } | |
| /// Return true if and only if this HIR contains any sub-expression that | |
| /// is required to match at the beginning of text. Specifically, this | |
| /// returns true if the `^` symbol (when multiline mode is disabled) or the | |
| /// `\A` escape appear anywhere in the regex. | |
| pub fn is_any_anchored_start(&self) -> bool { | |
| self.info.is_any_anchored_start() | |
| } | |
| /// Return true if and only if this HIR contains any sub-expression that is | |
| /// required to match at the end of text. Specifically, this returns true | |
| /// if the `$` symbol (when multiline mode is disabled) or the `\z` escape | |
| /// appear anywhere in the regex. | |
| pub fn is_any_anchored_end(&self) -> bool { | |
| self.info.is_any_anchored_end() | |
| } | |
| /// Return true if and only if the empty string is part of the language | |
| /// matched by this regular expression. | |
| /// | |
| /// This includes `a*`, `a?b*`, `a{0}`, `()`, `()+`, `^$`, `a|b?`, `\B`, | |
| /// but not `a`, `a+` or `\b`. | |
| pub fn is_match_empty(&self) -> bool { | |
| self.info.is_match_empty() | |
| } | |
| } | |
| impl HirKind { | |
| /// Return true if and only if this HIR is the empty regular expression. | |
| /// | |
| /// Note that this is not defined inductively. That is, it only tests if | |
| /// this kind is the `Empty` variant. To get the inductive definition, | |
| /// use the `is_match_empty` method on [`Hir`](struct.Hir.html). | |
| pub fn is_empty(&self) -> bool { | |
| match *self { | |
| HirKind::Empty => true, | |
| _ => false, | |
| } | |
| } | |
| /// Returns true if and only if this kind has any (including possibly | |
| /// empty) subexpressions. | |
| pub fn has_subexprs(&self) -> bool { | |
| match *self { | |
| HirKind::Empty | |
| | HirKind::Literal(_) | |
| | HirKind::Class(_) | |
| | HirKind::Anchor(_) | |
| | HirKind::WordBoundary(_) => false, | |
| HirKind::Group(_) | |
| | HirKind::Repetition(_) | |
| | HirKind::Concat(_) | |
| | HirKind::Alternation(_) => true, | |
| } | |
| } | |
| } | |
| /// Print a display representation of this Hir. | |
| /// | |
| /// The result of this is a valid regular expression pattern string. | |
| /// | |
| /// This implementation uses constant stack space and heap space proportional | |
| /// to the size of the `Hir`. | |
| impl fmt::Display for Hir { | |
| fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { | |
| use hir::print::Printer; | |
| Printer::new().print(self, f) | |
| } | |
| } | |
| /// The high-level intermediate representation of a literal. | |
| /// | |
| /// A literal corresponds to a single character, where a character is either | |
| /// defined by a Unicode scalar value or an arbitrary byte. Unicode characters | |
| /// are preferred whenever possible. In particular, a `Byte` variant is only | |
| /// ever produced when it could match invalid UTF-8. | |
| #[derive(Clone, Debug, Eq, PartialEq)] | |
| pub enum Literal { | |
| /// A single character represented by a Unicode scalar value. | |
| Unicode(char), | |
| /// A single character represented by an arbitrary byte. | |
| Byte(u8), | |
| } | |
| impl Literal { | |
| /// Returns true if and only if this literal corresponds to a Unicode | |
| /// scalar value. | |
| pub fn is_unicode(&self) -> bool { | |
| match *self { | |
| Literal::Unicode(_) => true, | |
| Literal::Byte(b) if b <= 0x7F => true, | |
| Literal::Byte(_) => false, | |
| } | |
| } | |
| } | |
| /// The high-level intermediate representation of a character class. | |
| /// | |
| /// A character class corresponds to a set of characters. A character is either | |
| /// defined by a Unicode scalar value or a byte. Unicode characters are used | |
| /// by default, while bytes are used when Unicode mode (via the `u` flag) is | |
| /// disabled. | |
| /// | |
| /// A character class, regardless of its character type, is represented by a | |
| /// sequence of non-overlapping non-adjacent ranges of characters. | |
| /// | |
| /// Note that unlike [`Literal`](enum.Literal.html), a `Bytes` variant may | |
| /// be produced even when it exclusively matches valid UTF-8. This is because | |
| /// a `Bytes` variant represents an intention by the author of the regular | |
| /// expression to disable Unicode mode, which in turn impacts the semantics of | |
| /// case insensitive matching. For example, `(?i)k` and `(?i-u)k` will not | |
| /// match the same set of strings. | |
| #[derive(Clone, Debug, Eq, PartialEq)] | |
| pub enum Class { | |
| /// A set of characters represented by Unicode scalar values. | |
| Unicode(ClassUnicode), | |
| /// A set of characters represented by arbitrary bytes (one byte per | |
| /// character). | |
| Bytes(ClassBytes), | |
| } | |
| impl Class { | |
| /// Apply Unicode simple case folding to this character class, in place. | |
| /// The character class will be expanded to include all simple case folded | |
| /// character variants. | |
| /// | |
| /// If this is a byte oriented character class, then this will be limited | |
| /// to the ASCII ranges `A-Z` and `a-z`. | |
| pub fn case_fold_simple(&mut self) { | |
| match *self { | |
| Class::Unicode(ref mut x) => x.case_fold_simple(), | |
| Class::Bytes(ref mut x) => x.case_fold_simple(), | |
| } | |
| } | |
| /// Negate this character class in place. | |
| /// | |
| /// After completion, this character class will contain precisely the | |
| /// characters that weren't previously in the class. | |
| pub fn negate(&mut self) { | |
| match *self { | |
| Class::Unicode(ref mut x) => x.negate(), | |
| Class::Bytes(ref mut x) => x.negate(), | |
| } | |
| } | |
| /// Returns true if and only if this character class will only ever match | |
| /// valid UTF-8. | |
| /// | |
| /// A character class can match invalid UTF-8 only when the following | |
| /// conditions are met: | |
| /// | |
| /// 1. The translator was configured to permit generating an expression | |
| /// that can match invalid UTF-8. (By default, this is disabled.) | |
| /// 2. Unicode mode (via the `u` flag) was disabled either in the concrete | |
| /// syntax or in the parser builder. By default, Unicode mode is | |
| /// enabled. | |
| pub fn is_always_utf8(&self) -> bool { | |
| match *self { | |
| Class::Unicode(_) => true, | |
| Class::Bytes(ref x) => x.is_all_ascii(), | |
| } | |
| } | |
| } | |
| /// A set of characters represented by Unicode scalar values. | |
| #[derive(Clone, Debug, Eq, PartialEq)] | |
| pub struct ClassUnicode { | |
| set: IntervalSet<ClassUnicodeRange>, | |
| } | |
| impl ClassUnicode { | |
| /// Create a new class from a sequence of ranges. | |
| /// | |
| /// The given ranges do not need to be in any specific order, and ranges | |
| /// may overlap. | |
| pub fn new<I>(ranges: I) -> ClassUnicode | |
| where I: IntoIterator<Item=ClassUnicodeRange> | |
| { | |
| ClassUnicode { set: IntervalSet::new(ranges) } | |
| } | |
| /// Create a new class with no ranges. | |
| pub fn empty() -> ClassUnicode { | |
| ClassUnicode::new(vec![]) | |
| } | |
| /// Add a new range to this set. | |
| pub fn push(&mut self, range: ClassUnicodeRange) { | |
| self.set.push(range); | |
| } | |
| /// Return an iterator over all ranges in this class. | |
| /// | |
| /// The iterator yields ranges in ascending order. | |
| pub fn iter(&self) -> ClassUnicodeIter { | |
| ClassUnicodeIter(self.set.iter()) | |
| } | |
| /// Return the underlying ranges as a slice. | |
| pub fn ranges(&self) -> &[ClassUnicodeRange] { | |
| self.set.intervals() | |
| } | |
| /// Expand this character class such that it contains all case folded | |
| /// characters, according to Unicode's "simple" mapping. For example, if | |
| /// this class consists of the range `a-z`, then applying case folding will | |
| /// result in the class containing both the ranges `a-z` and `A-Z`. | |
| pub fn case_fold_simple(&mut self) { | |
| self.set.case_fold_simple(); | |
| } | |
| /// Negate this character class. | |
| /// | |
| /// For all `c` where `c` is a Unicode scalar value, if `c` was in this | |
| /// set, then it will not be in this set after negation. | |
| pub fn negate(&mut self) { | |
| self.set.negate(); | |
| } | |
| /// Union this character class with the given character class, in place. | |
| pub fn union(&mut self, other: &ClassUnicode) { | |
| self.set.union(&other.set); | |
| } | |
| /// Intersect this character class with the given character class, in | |
| /// place. | |
| pub fn intersect(&mut self, other: &ClassUnicode) { | |
| self.set.intersect(&other.set); | |
| } | |
| /// Subtract the given character class from this character class, in place. | |
| pub fn difference(&mut self, other: &ClassUnicode) { | |
| self.set.difference(&other.set); | |
| } | |
| /// Compute the symmetric difference of the given character classes, in | |
| /// place. | |
| /// | |
| /// This computes the symmetric difference of two character classes. This | |
| /// removes all elements in this class that are also in the given class, | |
| /// but all adds all elements from the given class that aren't in this | |
| /// class. That is, the class will contain all elements in either class, | |
| /// but will not contain any elements that are in both classes. | |
| pub fn symmetric_difference(&mut self, other: &ClassUnicode) { | |
| self.set.symmetric_difference(&other.set); | |
| } | |
| } | |
| /// An iterator over all ranges in a Unicode character class. | |
| /// | |
| /// The lifetime `'a` refers to the lifetime of the underlying class. | |
| #[derive(Debug)] | |
| pub struct ClassUnicodeIter<'a>(IntervalSetIter<'a, ClassUnicodeRange>); | |
| impl<'a> Iterator for ClassUnicodeIter<'a> { | |
| type Item = &'a ClassUnicodeRange; | |
| fn next(&mut self) -> Option<&'a ClassUnicodeRange> { | |
| self.0.next() | |
| } | |
| } | |
| /// A single range of characters represented by Unicode scalar values. | |
| /// | |
| /// The range is closed. That is, the start and end of the range are included | |
| /// in the range. | |
| #[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)] | |
| pub struct ClassUnicodeRange { | |
| start: char, | |
| end: char, | |
| } | |
| impl fmt::Debug for ClassUnicodeRange { | |
| fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { | |
| let start = | |
| if !self.start.is_whitespace() && !self.start.is_control() { | |
| self.start.to_string() | |
| } else { | |
| format!("0x{:X}", self.start as u32) | |
| }; | |
| let end = | |
| if !self.end.is_whitespace() && !self.end.is_control() { | |
| self.end.to_string() | |
| } else { | |
| format!("0x{:X}", self.end as u32) | |
| }; | |
| f.debug_struct("ClassUnicodeRange") | |
| .field("start", &start) | |
| .field("end", &end) | |
| .finish() | |
| } | |
| } | |
| impl Interval for ClassUnicodeRange { | |
| type Bound = char; | |
| #[inline] fn lower(&self) -> char { self.start } | |
| #[inline] fn upper(&self) -> char { self.end } | |
| #[inline] fn set_lower(&mut self, bound: char) { self.start = bound; } | |
| #[inline] fn set_upper(&mut self, bound: char) { self.end = bound; } | |
| /// Apply simple case folding to this Unicode scalar value range. | |
| /// | |
| /// Additional ranges are appended to the given vector. Canonical ordering | |
| /// is *not* maintained in the given vector. | |
| fn case_fold_simple(&self, ranges: &mut Vec<ClassUnicodeRange>) { | |
| if !unicode::contains_simple_case_mapping(self.start, self.end) { | |
| return; | |
| } | |
| let start = self.start as u32; | |
| let end = (self.end as u32).saturating_add(1); | |
| let mut next_simple_cp = None; | |
| for cp in (start..end).filter_map(char::from_u32) { | |
| if next_simple_cp.map_or(false, |next| cp < next) { | |
| continue; | |
| } | |
| let it = match unicode::simple_fold(cp) { | |
| Ok(it) => it, | |
| Err(next) => { | |
| next_simple_cp = next; | |
| continue; | |
| } | |
| }; | |
| for cp_folded in it { | |
| ranges.push(ClassUnicodeRange::new(cp_folded, cp_folded)); | |
| } | |
| } | |
| } | |
| } | |
| impl ClassUnicodeRange { | |
| /// Create a new Unicode scalar value range for a character class. | |
| /// | |
| /// The returned range is always in a canonical form. That is, the range | |
| /// returned always satisfies the invariant that `start <= end`. | |
| pub fn new(start: char, end: char) -> ClassUnicodeRange { | |
| ClassUnicodeRange::create(start, end) | |
| } | |
| /// Return the start of this range. | |
| /// | |
| /// The start of a range is always less than or equal to the end of the | |
| /// range. | |
| pub fn start(&self) -> char { | |
| self.start | |
| } | |
| /// Return the end of this range. | |
| /// | |
| /// The end of a range is always greater than or equal to the start of the | |
| /// range. | |
| pub fn end(&self) -> char { | |
| self.end | |
| } | |
| } | |
| /// A set of characters represented by arbitrary bytes (where one byte | |
| /// corresponds to one character). | |
| #[derive(Clone, Debug, Eq, PartialEq)] | |
| pub struct ClassBytes { | |
| set: IntervalSet<ClassBytesRange>, | |
| } | |
| impl ClassBytes { | |
| /// Create a new class from a sequence of ranges. | |
| /// | |
| /// The given ranges do not need to be in any specific order, and ranges | |
| /// may overlap. | |
| pub fn new<I>(ranges: I) -> ClassBytes | |
| where I: IntoIterator<Item=ClassBytesRange> | |
| { | |
| ClassBytes { set: IntervalSet::new(ranges) } | |
| } | |
| /// Create a new class with no ranges. | |
| pub fn empty() -> ClassBytes { | |
| ClassBytes::new(vec![]) | |
| } | |
| /// Add a new range to this set. | |
| pub fn push(&mut self, range: ClassBytesRange) { | |
| self.set.push(range); | |
| } | |
| /// Return an iterator over all ranges in this class. | |
| /// | |
| /// The iterator yields ranges in ascending order. | |
| pub fn iter(&self) -> ClassBytesIter { | |
| ClassBytesIter(self.set.iter()) | |
| } | |
| /// Return the underlying ranges as a slice. | |
| pub fn ranges(&self) -> &[ClassBytesRange] { | |
| self.set.intervals() | |
| } | |
| /// Expand this character class such that it contains all case folded | |
| /// characters. For example, if this class consists of the range `a-z`, | |
| /// then applying case folding will result in the class containing both the | |
| /// ranges `a-z` and `A-Z`. | |
| /// | |
| /// Note that this only applies ASCII case folding, which is limited to the | |
| /// characters `a-z` and `A-Z`. | |
| pub fn case_fold_simple(&mut self) { | |
| self.set.case_fold_simple(); | |
| } | |
| /// Negate this byte class. | |
| /// | |
| /// For all `b` where `b` is a any byte, if `b` was in this set, then it | |
| /// will not be in this set after negation. | |
| pub fn negate(&mut self) { | |
| self.set.negate(); | |
| } | |
| /// Union this byte class with the given byte class, in place. | |
| pub fn union(&mut self, other: &ClassBytes) { | |
| self.set.union(&other.set); | |
| } | |
| /// Intersect this byte class with the given byte class, in place. | |
| pub fn intersect(&mut self, other: &ClassBytes) { | |
| self.set.intersect(&other.set); | |
| } | |
| /// Subtract the given byte class from this byte class, in place. | |
| pub fn difference(&mut self, other: &ClassBytes) { | |
| self.set.difference(&other.set); | |
| } | |
| /// Compute the symmetric difference of the given byte classes, in place. | |
| /// | |
| /// This computes the symmetric difference of two byte classes. This | |
| /// removes all elements in this class that are also in the given class, | |
| /// but all adds all elements from the given class that aren't in this | |
| /// class. That is, the class will contain all elements in either class, | |
| /// but will not contain any elements that are in both classes. | |
| pub fn symmetric_difference(&mut self, other: &ClassBytes) { | |
| self.set.symmetric_difference(&other.set); | |
| } | |
| /// Returns true if and only if this character class will either match | |
| /// nothing or only ASCII bytes. Stated differently, this returns false | |
| /// if and only if this class contains a non-ASCII byte. | |
| pub fn is_all_ascii(&self) -> bool { | |
| self.set.intervals().last().map_or(true, |r| r.end <= 0x7F) | |
| } | |
| } | |
| /// An iterator over all ranges in a byte character class. | |
| /// | |
| /// The lifetime `'a` refers to the lifetime of the underlying class. | |
| #[derive(Debug)] | |
| pub struct ClassBytesIter<'a>(IntervalSetIter<'a, ClassBytesRange>); | |
| impl<'a> Iterator for ClassBytesIter<'a> { | |
| type Item = &'a ClassBytesRange; | |
| fn next(&mut self) -> Option<&'a ClassBytesRange> { | |
| self.0.next() | |
| } | |
| } | |
| /// A single range of characters represented by arbitrary bytes. | |
| /// | |
| /// The range is closed. That is, the start and end of the range are included | |
| /// in the range. | |
| #[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)] | |
| pub struct ClassBytesRange { | |
| start: u8, | |
| end: u8, | |
| } | |
| impl Interval for ClassBytesRange { | |
| type Bound = u8; | |
| #[inline] fn lower(&self) -> u8 { self.start } | |
| #[inline] fn upper(&self) -> u8 { self.end } | |
| #[inline] fn set_lower(&mut self, bound: u8) { self.start = bound; } | |
| #[inline] fn set_upper(&mut self, bound: u8) { self.end = bound; } | |
| /// Apply simple case folding to this byte range. Only ASCII case mappings | |
| /// (for a-z) are applied. | |
| /// | |
| /// Additional ranges are appended to the given vector. Canonical ordering | |
| /// is *not* maintained in the given vector. | |
| fn case_fold_simple(&self, ranges: &mut Vec<ClassBytesRange>) { | |
| if !ClassBytesRange::new(b'a', b'z').is_intersection_empty(self) { | |
| let lower = cmp::max(self.start, b'a'); | |
| let upper = cmp::min(self.end, b'z'); | |
| ranges.push(ClassBytesRange::new(lower - 32, upper - 32)); | |
| } | |
| if !ClassBytesRange::new(b'A', b'Z').is_intersection_empty(self) { | |
| let lower = cmp::max(self.start, b'A'); | |
| let upper = cmp::min(self.end, b'Z'); | |
| ranges.push(ClassBytesRange::new(lower + 32, upper + 32)); | |
| } | |
| } | |
| } | |
| impl ClassBytesRange { | |
| /// Create a new byte range for a character class. | |
| /// | |
| /// The returned range is always in a canonical form. That is, the range | |
| /// returned always satisfies the invariant that `start <= end`. | |
| pub fn new(start: u8, end: u8) -> ClassBytesRange { | |
| ClassBytesRange::create(start, end) | |
| } | |
| /// Return the start of this range. | |
| /// | |
| /// The start of a range is always less than or equal to the end of the | |
| /// range. | |
| pub fn start(&self) -> u8 { | |
| self.start | |
| } | |
| /// Return the end of this range. | |
| /// | |
| /// The end of a range is always greater than or equal to the start of the | |
| /// range. | |
| pub fn end(&self) -> u8 { | |
| self.end | |
| } | |
| } | |
| impl fmt::Debug for ClassBytesRange { | |
| fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { | |
| let mut debug = f.debug_struct("ClassBytesRange"); | |
| if self.start <= 0x7F { | |
| debug.field("start", &(self.start as char)); | |
| } else { | |
| debug.field("start", &self.start); | |
| } | |
| if self.end <= 0x7F { | |
| debug.field("end", &(self.end as char)); | |
| } else { | |
| debug.field("end", &self.end); | |
| } | |
| debug.finish() | |
| } | |
| } | |
| /// The high-level intermediate representation for an anchor assertion. | |
| /// | |
| /// A matching anchor assertion is always zero-length. | |
| #[derive(Clone, Debug, Eq, PartialEq)] | |
| pub enum Anchor { | |
| /// Match the beginning of a line or the beginning of text. Specifically, | |
| /// this matches at the starting position of the input, or at the position | |
| /// immediately following a `\n` character. | |
| StartLine, | |
| /// Match the end of a line or the end of text. Specifically, | |
| /// this matches at the end position of the input, or at the position | |
| /// immediately preceding a `\n` character. | |
| EndLine, | |
| /// Match the beginning of text. Specifically, this matches at the starting | |
| /// position of the input. | |
| StartText, | |
| /// Match the end of text. Specifically, this matches at the ending | |
| /// position of the input. | |
| EndText, | |
| } | |
| /// The high-level intermediate representation for a word-boundary assertion. | |
| /// | |
| /// A matching word boundary assertion is always zero-length. | |
| #[derive(Clone, Debug, Eq, PartialEq)] | |
| pub enum WordBoundary { | |
| /// Match a Unicode-aware word boundary. That is, this matches a position | |
| /// where the left adjacent character and right adjacent character | |
| /// correspond to a word and non-word or a non-word and word character. | |
| Unicode, | |
| /// Match a Unicode-aware negation of a word boundary. | |
| UnicodeNegate, | |
| /// Match an ASCII-only word boundary. That is, this matches a position | |
| /// where the left adjacent character and right adjacent character | |
| /// correspond to a word and non-word or a non-word and word character. | |
| Ascii, | |
| /// Match an ASCII-only negation of a word boundary. | |
| AsciiNegate, | |
| } | |
| impl WordBoundary { | |
| /// Returns true if and only if this word boundary assertion is negated. | |
| pub fn is_negated(&self) -> bool { | |
| match *self { | |
| WordBoundary::Unicode | WordBoundary::Ascii => false, | |
| WordBoundary::UnicodeNegate | WordBoundary::AsciiNegate => true, | |
| } | |
| } | |
| } | |
| /// The high-level intermediate representation for a group. | |
| /// | |
| /// This represents one of three possible group types: | |
| /// | |
| /// 1. A non-capturing group (e.g., `(?:expr)`). | |
| /// 2. A capturing group (e.g., `(expr)`). | |
| /// 3. A named capturing group (e.g., `(?P<name>expr)`). | |
| #[derive(Clone, Debug, Eq, PartialEq)] | |
| pub struct Group { | |
| /// The kind of this group. If it is a capturing group, then the kind | |
| /// contains the capture group index (and the name, if it is a named | |
| /// group). | |
| pub kind: GroupKind, | |
| /// The expression inside the capturing group, which may be empty. | |
| pub hir: Box<Hir>, | |
| } | |
| /// The kind of group. | |
| #[derive(Clone, Debug, Eq, PartialEq)] | |
| pub enum GroupKind { | |
| /// A normal unnamed capturing group. | |
| /// | |
| /// The value is the capture index of the group. | |
| CaptureIndex(u32), | |
| /// A named capturing group. | |
| CaptureName { | |
| /// The name of the group. | |
| name: String, | |
| /// The capture index of the group. | |
| index: u32, | |
| }, | |
| /// A non-capturing group. | |
| NonCapturing, | |
| } | |
| /// The high-level intermediate representation of a repetition operator. | |
| /// | |
| /// A repetition operator permits the repetition of an arbitrary | |
| /// sub-expression. | |
| #[derive(Clone, Debug, Eq, PartialEq)] | |
| pub struct Repetition { | |
| /// The kind of this repetition operator. | |
| pub kind: RepetitionKind, | |
| /// Whether this repetition operator is greedy or not. A greedy operator | |
| /// will match as much as it can. A non-greedy operator will match as | |
| /// little as it can. | |
| /// | |
| /// Typically, operators are greedy by default and are only non-greedy when | |
| /// a `?` suffix is used, e.g., `(expr)*` is greedy while `(expr)*?` is | |
| /// not. However, this can be inverted via the `U` "ungreedy" flag. | |
| pub greedy: bool, | |
| /// The expression being repeated. | |
| pub hir: Box<Hir>, | |
| } | |
| impl Repetition { | |
| /// Returns true if and only if this repetition operator makes it possible | |
| /// to match the empty string. | |
| /// | |
| /// Note that this is not defined inductively. For example, while `a*` | |
| /// will report `true`, `()+` will not, even though `()` matches the empty | |
| /// string and one or more occurrences of something that matches the empty | |
| /// string will always match the empty string. In order to get the | |
| /// inductive definition, see the corresponding method on | |
| /// [`Hir`](struct.Hir.html). | |
| pub fn is_match_empty(&self) -> bool { | |
| match self.kind { | |
| RepetitionKind::ZeroOrOne => true, | |
| RepetitionKind::ZeroOrMore => true, | |
| RepetitionKind::OneOrMore => false, | |
| RepetitionKind::Range(RepetitionRange::Exactly(m)) => m == 0, | |
| RepetitionKind::Range(RepetitionRange::AtLeast(m)) => m == 0, | |
| RepetitionKind::Range(RepetitionRange::Bounded(m, _)) => m == 0, | |
| } | |
| } | |
| } | |
| /// The kind of a repetition operator. | |
| #[derive(Clone, Debug, Eq, PartialEq)] | |
| pub enum RepetitionKind { | |
| /// Matches a sub-expression zero or one times. | |
| ZeroOrOne, | |
| /// Matches a sub-expression zero or more times. | |
| ZeroOrMore, | |
| /// Matches a sub-expression one or more times. | |
| OneOrMore, | |
| /// Matches a sub-expression within a bounded range of times. | |
| Range(RepetitionRange), | |
| } | |
| /// The kind of a counted repetition operator. | |
| #[derive(Clone, Debug, Eq, PartialEq)] | |
| pub enum RepetitionRange { | |
| /// Matches a sub-expression exactly this many times. | |
| Exactly(u32), | |
| /// Matches a sub-expression at least this many times. | |
| AtLeast(u32), | |
| /// Matches a sub-expression at least `m` times and at most `n` times. | |
| Bounded(u32, u32), | |
| } | |
| /// A custom `Drop` impl is used for `HirKind` such that it uses constant stack | |
| /// space but heap space proportional to the depth of the total `Hir`. | |
| impl Drop for Hir { | |
| fn drop(&mut self) { | |
| use std::mem; | |
| match *self.kind() { | |
| HirKind::Empty | |
| | HirKind::Literal(_) | |
| | HirKind::Class(_) | |
| | HirKind::Anchor(_) | |
| | HirKind::WordBoundary(_) => return, | |
| HirKind::Group(ref x) if !x.hir.kind.has_subexprs() => return, | |
| HirKind::Repetition(ref x) if !x.hir.kind.has_subexprs() => return, | |
| HirKind::Concat(ref x) if x.is_empty() => return, | |
| HirKind::Alternation(ref x) if x.is_empty() => return, | |
| _ => {} | |
| } | |
| let mut stack = vec![mem::replace(self, Hir::empty())]; | |
| while let Some(mut expr) = stack.pop() { | |
| match expr.kind { | |
| HirKind::Empty | |
| | HirKind::Literal(_) | |
| | HirKind::Class(_) | |
| | HirKind::Anchor(_) | |
| | HirKind::WordBoundary(_) => {} | |
| HirKind::Group(ref mut x) => { | |
| stack.push(mem::replace(&mut x.hir, Hir::empty())); | |
| } | |
| HirKind::Repetition(ref mut x) => { | |
| stack.push(mem::replace(&mut x.hir, Hir::empty())); | |
| } | |
| HirKind::Concat(ref mut x) => { | |
| stack.extend(x.drain(..)); | |
| } | |
| HirKind::Alternation(ref mut x) => { | |
| stack.extend(x.drain(..)); | |
| } | |
| } | |
| } | |
| } | |
| } | |
| /// A type that documents various attributes of an HIR expression. | |
| /// | |
| /// These attributes are typically defined inductively on the HIR. | |
| #[derive(Clone, Debug, Eq, PartialEq)] | |
| struct HirInfo { | |
| /// Represent yes/no questions by a bitfield to conserve space, since | |
| /// this is included in every HIR expression. | |
| /// | |
| /// If more attributes need to be added, it is OK to increase the size of | |
| /// this as appropriate. | |
| bools: u8, | |
| } | |
| // A simple macro for defining bitfield accessors/mutators. | |
| macro_rules! define_bool { | |
| ($bit:expr, $is_fn_name:ident, $set_fn_name:ident) => { | |
| fn $is_fn_name(&self) -> bool { | |
| self.bools & (0b1 << $bit) > 0 | |
| } | |
| fn $set_fn_name(&mut self, yes: bool) { | |
| if yes { | |
| self.bools |= 1 << $bit; | |
| } else { | |
| self.bools &= !(1 << $bit); | |
| } | |
| } | |
| } | |
| } | |
| impl HirInfo { | |
| fn new() -> HirInfo { | |
| HirInfo { | |
| bools: 0, | |
| } | |
| } | |
| define_bool!(0, is_always_utf8, set_always_utf8); | |
| define_bool!(1, is_all_assertions, set_all_assertions); | |
| define_bool!(2, is_anchored_start, set_anchored_start); | |
| define_bool!(3, is_anchored_end, set_anchored_end); | |
| define_bool!(4, is_any_anchored_start, set_any_anchored_start); | |
| define_bool!(5, is_any_anchored_end, set_any_anchored_end); | |
| define_bool!(6, is_match_empty, set_match_empty); | |
| } | |
| #[cfg(test)] | |
| mod tests { | |
| use super::*; | |
| fn uclass(ranges: &[(char, char)]) -> ClassUnicode { | |
| let ranges: Vec<ClassUnicodeRange> = ranges | |
| .iter() | |
| .map(|&(s, e)| ClassUnicodeRange::new(s, e)) | |
| .collect(); | |
| ClassUnicode::new(ranges) | |
| } | |
| fn bclass(ranges: &[(u8, u8)]) -> ClassBytes { | |
| let ranges: Vec<ClassBytesRange> = ranges | |
| .iter() | |
| .map(|&(s, e)| ClassBytesRange::new(s, e)) | |
| .collect(); | |
| ClassBytes::new(ranges) | |
| } | |
| fn uranges(cls: &ClassUnicode) -> Vec<(char, char)> { | |
| cls.iter().map(|x| (x.start(), x.end())).collect() | |
| } | |
| fn ucasefold(cls: &ClassUnicode) -> ClassUnicode { | |
| let mut cls_ = cls.clone(); | |
| cls_.case_fold_simple(); | |
| cls_ | |
| } | |
| fn uunion(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode { | |
| let mut cls_ = cls1.clone(); | |
| cls_.union(cls2); | |
| cls_ | |
| } | |
| fn uintersect(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode { | |
| let mut cls_ = cls1.clone(); | |
| cls_.intersect(cls2); | |
| cls_ | |
| } | |
| fn udifference(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode { | |
| let mut cls_ = cls1.clone(); | |
| cls_.difference(cls2); | |
| cls_ | |
| } | |
| fn usymdifference(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode { | |
| let mut cls_ = cls1.clone(); | |
| cls_.symmetric_difference(cls2); | |
| cls_ | |
| } | |
| fn unegate(cls: &ClassUnicode) -> ClassUnicode { | |
| let mut cls_ = cls.clone(); | |
| cls_.negate(); | |
| cls_ | |
| } | |
| fn branges(cls: &ClassBytes) -> Vec<(u8, u8)> { | |
| cls.iter().map(|x| (x.start(), x.end())).collect() | |
| } | |
| fn bcasefold(cls: &ClassBytes) -> ClassBytes { | |
| let mut cls_ = cls.clone(); | |
| cls_.case_fold_simple(); | |
| cls_ | |
| } | |
| fn bunion(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes { | |
| let mut cls_ = cls1.clone(); | |
| cls_.union(cls2); | |
| cls_ | |
| } | |
| fn bintersect(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes { | |
| let mut cls_ = cls1.clone(); | |
| cls_.intersect(cls2); | |
| cls_ | |
| } | |
| fn bdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes { | |
| let mut cls_ = cls1.clone(); | |
| cls_.difference(cls2); | |
| cls_ | |
| } | |
| fn bsymdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes { | |
| let mut cls_ = cls1.clone(); | |
| cls_.symmetric_difference(cls2); | |
| cls_ | |
| } | |
| fn bnegate(cls: &ClassBytes) -> ClassBytes { | |
| let mut cls_ = cls.clone(); | |
| cls_.negate(); | |
| cls_ | |
| } | |
| #[test] | |
| fn class_range_canonical_unicode() { | |
| let range = ClassUnicodeRange::new('\u{00FF}', '\0'); | |
| assert_eq!('\0', range.start()); | |
| assert_eq!('\u{00FF}', range.end()); | |
| } | |
| #[test] | |
| fn class_range_canonical_bytes() { | |
| let range = ClassBytesRange::new(b'\xFF', b'\0'); | |
| assert_eq!(b'\0', range.start()); | |
| assert_eq!(b'\xFF', range.end()); | |
| } | |
| #[test] | |
| fn class_canonicalize_unicode() { | |
| let cls = uclass(&[('a', 'c'), ('x', 'z')]); | |
| let expected = vec![('a', 'c'), ('x', 'z')]; | |
| assert_eq!(expected, uranges(&cls)); | |
| let cls = uclass(&[('x', 'z'), ('a', 'c')]); | |
| let expected = vec![('a', 'c'), ('x', 'z')]; | |
| assert_eq!(expected, uranges(&cls)); | |
| let cls = uclass(&[('x', 'z'), ('w', 'y')]); | |
| let expected = vec![('w', 'z')]; | |
| assert_eq!(expected, uranges(&cls)); | |
| let cls = uclass(&[ | |
| ('c', 'f'), ('a', 'g'), ('d', 'j'), ('a', 'c'), | |
| ('m', 'p'), ('l', 's'), | |
| ]); | |
| let expected = vec![('a', 'j'), ('l', 's')]; | |
| assert_eq!(expected, uranges(&cls)); | |
| let cls = uclass(&[('x', 'z'), ('u', 'w')]); | |
| let expected = vec![('u', 'z')]; | |
| assert_eq!(expected, uranges(&cls)); | |
| let cls = uclass(&[('\x00', '\u{10FFFF}'), ('\x00', '\u{10FFFF}')]); | |
| let expected = vec![('\x00', '\u{10FFFF}')]; | |
| assert_eq!(expected, uranges(&cls)); | |
| let cls = uclass(&[('a', 'a'), ('b', 'b')]); | |
| let expected = vec![('a', 'b')]; | |
| assert_eq!(expected, uranges(&cls)); | |
| } | |
| #[test] | |
| fn class_canonicalize_bytes() { | |
| let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]); | |
| let expected = vec![(b'a', b'c'), (b'x', b'z')]; | |
| assert_eq!(expected, branges(&cls)); | |
| let cls = bclass(&[(b'x', b'z'), (b'a', b'c')]); | |
| let expected = vec![(b'a', b'c'), (b'x', b'z')]; | |
| assert_eq!(expected, branges(&cls)); | |
| let cls = bclass(&[(b'x', b'z'), (b'w', b'y')]); | |
| let expected = vec![(b'w', b'z')]; | |
| assert_eq!(expected, branges(&cls)); | |
| let cls = bclass(&[ | |
| (b'c', b'f'), (b'a', b'g'), (b'd', b'j'), (b'a', b'c'), | |
| (b'm', b'p'), (b'l', b's'), | |
| ]); | |
| let expected = vec![(b'a', b'j'), (b'l', b's')]; | |
| assert_eq!(expected, branges(&cls)); | |
| let cls = bclass(&[(b'x', b'z'), (b'u', b'w')]); | |
| let expected = vec![(b'u', b'z')]; | |
| assert_eq!(expected, branges(&cls)); | |
| let cls = bclass(&[(b'\x00', b'\xFF'), (b'\x00', b'\xFF')]); | |
| let expected = vec![(b'\x00', b'\xFF')]; | |
| assert_eq!(expected, branges(&cls)); | |
| let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]); | |
| let expected = vec![(b'a', b'b')]; | |
| assert_eq!(expected, branges(&cls)); | |
| } | |
| #[test] | |
| fn class_case_fold_unicode() { | |
| let cls = uclass(&[ | |
| ('C', 'F'), ('A', 'G'), ('D', 'J'), ('A', 'C'), | |
| ('M', 'P'), ('L', 'S'), ('c', 'f'), | |
| ]); | |
| let expected = uclass(&[ | |
| ('A', 'J'), ('L', 'S'), | |
| ('a', 'j'), ('l', 's'), | |
| ('\u{17F}', '\u{17F}'), | |
| ]); | |
| assert_eq!(expected, ucasefold(&cls)); | |
| let cls = uclass(&[('A', 'Z')]); | |
| let expected = uclass(&[ | |
| ('A', 'Z'), ('a', 'z'), | |
| ('\u{17F}', '\u{17F}'), | |
| ('\u{212A}', '\u{212A}'), | |
| ]); | |
| assert_eq!(expected, ucasefold(&cls)); | |
| let cls = uclass(&[('a', 'z')]); | |
| let expected = uclass(&[ | |
| ('A', 'Z'), ('a', 'z'), | |
| ('\u{17F}', '\u{17F}'), | |
| ('\u{212A}', '\u{212A}'), | |
| ]); | |
| assert_eq!(expected, ucasefold(&cls)); | |
| let cls = uclass(&[('A', 'A'), ('_', '_')]); | |
| let expected = uclass(&[('A', 'A'), ('_', '_'), ('a', 'a')]); | |
| assert_eq!(expected, ucasefold(&cls)); | |
| let cls = uclass(&[('A', 'A'), ('=', '=')]); | |
| let expected = uclass(&[('=', '='), ('A', 'A'), ('a', 'a')]); | |
| assert_eq!(expected, ucasefold(&cls)); | |
| let cls = uclass(&[('\x00', '\x10')]); | |
| assert_eq!(cls, ucasefold(&cls)); | |
| let cls = uclass(&[('k', 'k')]); | |
| let expected = uclass(&[ | |
| ('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}'), | |
| ]); | |
| assert_eq!(expected, ucasefold(&cls)); | |
| let cls = uclass(&[('@', '@')]); | |
| assert_eq!(cls, ucasefold(&cls)); | |
| } | |
| #[test] | |
| fn class_case_fold_bytes() { | |
| let cls = bclass(&[ | |
| (b'C', b'F'), (b'A', b'G'), (b'D', b'J'), (b'A', b'C'), | |
| (b'M', b'P'), (b'L', b'S'), (b'c', b'f'), | |
| ]); | |
| let expected = bclass(&[ | |
| (b'A', b'J'), (b'L', b'S'), | |
| (b'a', b'j'), (b'l', b's'), | |
| ]); | |
| assert_eq!(expected, bcasefold(&cls)); | |
| let cls = bclass(&[(b'A', b'Z')]); | |
| let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]); | |
| assert_eq!(expected, bcasefold(&cls)); | |
| let cls = bclass(&[(b'a', b'z')]); | |
| let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]); | |
| assert_eq!(expected, bcasefold(&cls)); | |
| let cls = bclass(&[(b'A', b'A'), (b'_', b'_')]); | |
| let expected = bclass(&[(b'A', b'A'), (b'_', b'_'), (b'a', b'a')]); | |
| assert_eq!(expected, bcasefold(&cls)); | |
| let cls = bclass(&[(b'A', b'A'), (b'=', b'=')]); | |
| let expected = bclass(&[(b'=', b'='), (b'A', b'A'), (b'a', b'a')]); | |
| assert_eq!(expected, bcasefold(&cls)); | |
| let cls = bclass(&[(b'\x00', b'\x10')]); | |
| assert_eq!(cls, bcasefold(&cls)); | |
| let cls = bclass(&[(b'k', b'k')]); | |
| let expected = bclass(&[(b'K', b'K'), (b'k', b'k')]); | |
| assert_eq!(expected, bcasefold(&cls)); | |
| let cls = bclass(&[(b'@', b'@')]); | |
| assert_eq!(cls, bcasefold(&cls)); | |
| } | |
| #[test] | |
| fn class_negate_unicode() { | |
| let cls = uclass(&[('a', 'a')]); | |
| let expected = uclass(&[('\x00', '\x60'), ('\x62', '\u{10FFFF}')]); | |
| assert_eq!(expected, unegate(&cls)); | |
| let cls = uclass(&[('a', 'a'), ('b', 'b')]); | |
| let expected = uclass(&[('\x00', '\x60'), ('\x63', '\u{10FFFF}')]); | |
| assert_eq!(expected, unegate(&cls)); | |
| let cls = uclass(&[('a', 'c'), ('x', 'z')]); | |
| let expected = uclass(&[ | |
| ('\x00', '\x60'), ('\x64', '\x77'), ('\x7B', '\u{10FFFF}'), | |
| ]); | |
| assert_eq!(expected, unegate(&cls)); | |
| let cls = uclass(&[('\x00', 'a')]); | |
| let expected = uclass(&[('\x62', '\u{10FFFF}')]); | |
| assert_eq!(expected, unegate(&cls)); | |
| let cls = uclass(&[('a', '\u{10FFFF}')]); | |
| let expected = uclass(&[('\x00', '\x60')]); | |
| assert_eq!(expected, unegate(&cls)); | |
| let cls = uclass(&[('\x00', '\u{10FFFF}')]); | |
| let expected = uclass(&[]); | |
| assert_eq!(expected, unegate(&cls)); | |
| let cls = uclass(&[]); | |
| let expected = uclass(&[('\x00', '\u{10FFFF}')]); | |
| assert_eq!(expected, unegate(&cls)); | |
| let cls = uclass(&[ | |
| ('\x00', '\u{10FFFD}'), ('\u{10FFFF}', '\u{10FFFF}'), | |
| ]); | |
| let expected = uclass(&[('\u{10FFFE}', '\u{10FFFE}')]); | |
| assert_eq!(expected, unegate(&cls)); | |
| let cls = uclass(&[('\x00', '\u{D7FF}')]); | |
| let expected = uclass(&[('\u{E000}', '\u{10FFFF}')]); | |
| assert_eq!(expected, unegate(&cls)); | |
| let cls = uclass(&[('\x00', '\u{D7FE}')]); | |
| let expected = uclass(&[('\u{D7FF}', '\u{10FFFF}')]); | |
| assert_eq!(expected, unegate(&cls)); | |
| let cls = uclass(&[('\u{E000}', '\u{10FFFF}')]); | |
| let expected = uclass(&[('\x00', '\u{D7FF}')]); | |
| assert_eq!(expected, unegate(&cls)); | |
| let cls = uclass(&[('\u{E001}', '\u{10FFFF}')]); | |
| let expected = uclass(&[('\x00', '\u{E000}')]); | |
| assert_eq!(expected, unegate(&cls)); | |
| } | |
| #[test] | |
| fn class_negate_bytes() { | |
| let cls = bclass(&[(b'a', b'a')]); | |
| let expected = bclass(&[(b'\x00', b'\x60'), (b'\x62', b'\xFF')]); | |
| assert_eq!(expected, bnegate(&cls)); | |
| let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]); | |
| let expected = bclass(&[(b'\x00', b'\x60'), (b'\x63', b'\xFF')]); | |
| assert_eq!(expected, bnegate(&cls)); | |
| let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]); | |
| let expected = bclass(&[ | |
| (b'\x00', b'\x60'), (b'\x64', b'\x77'), (b'\x7B', b'\xFF'), | |
| ]); | |
| assert_eq!(expected, bnegate(&cls)); | |
| let cls = bclass(&[(b'\x00', b'a')]); | |
| let expected = bclass(&[(b'\x62', b'\xFF')]); | |
| assert_eq!(expected, bnegate(&cls)); | |
| let cls = bclass(&[(b'a', b'\xFF')]); | |
| let expected = bclass(&[(b'\x00', b'\x60')]); | |
| assert_eq!(expected, bnegate(&cls)); | |
| let cls = bclass(&[(b'\x00', b'\xFF')]); | |
| let expected = bclass(&[]); | |
| assert_eq!(expected, bnegate(&cls)); | |
| let cls = bclass(&[]); | |
| let expected = bclass(&[(b'\x00', b'\xFF')]); | |
| assert_eq!(expected, bnegate(&cls)); | |
| let cls = bclass(&[(b'\x00', b'\xFD'), (b'\xFF', b'\xFF')]); | |
| let expected = bclass(&[(b'\xFE', b'\xFE')]); | |
| assert_eq!(expected, bnegate(&cls)); | |
| } | |
| #[test] | |
| fn class_union_unicode() { | |
| let cls1 = uclass(&[('a', 'g'), ('m', 't'), ('A', 'C')]); | |
| let cls2 = uclass(&[('a', 'z')]); | |
| let expected = uclass(&[('a', 'z'), ('A', 'C')]); | |
| assert_eq!(expected, uunion(&cls1, &cls2)); | |
| } | |
| #[test] | |
| fn class_union_bytes() { | |
| let cls1 = bclass(&[(b'a', b'g'), (b'm', b't'), (b'A', b'C')]); | |
| let cls2 = bclass(&[(b'a', b'z')]); | |
| let expected = bclass(&[(b'a', b'z'), (b'A', b'C')]); | |
| assert_eq!(expected, bunion(&cls1, &cls2)); | |
| } | |
| #[test] | |
| fn class_intersect_unicode() { | |
| let cls1 = uclass(&[]); | |
| let cls2 = uclass(&[('a', 'a')]); | |
| let expected = uclass(&[]); | |
| assert_eq!(expected, uintersect(&cls1, &cls2)); | |
| let cls1 = uclass(&[('a', 'a')]); | |
| let cls2 = uclass(&[('a', 'a')]); | |
| let expected = uclass(&[('a', 'a')]); | |
| assert_eq!(expected, uintersect(&cls1, &cls2)); | |
| let cls1 = uclass(&[('a', 'a')]); | |
| let cls2 = uclass(&[('b', 'b')]); | |
| let expected = uclass(&[]); | |
| assert_eq!(expected, uintersect(&cls1, &cls2)); | |
| let cls1 = uclass(&[('a', 'a')]); | |
| let cls2 = uclass(&[('a', 'c')]); | |
| let expected = uclass(&[('a', 'a')]); | |
| assert_eq!(expected, uintersect(&cls1, &cls2)); | |
| let cls1 = uclass(&[('a', 'b')]); | |
| let cls2 = uclass(&[('a', 'c')]); | |
| let expected = uclass(&[('a', 'b')]); | |
| assert_eq!(expected, uintersect(&cls1, &cls2)); | |
| let cls1 = uclass(&[('a', 'b')]); | |
| let cls2 = uclass(&[('b', 'c')]); | |
| let expected = uclass(&[('b', 'b')]); | |
| assert_eq!(expected, uintersect(&cls1, &cls2)); | |
| let cls1 = uclass(&[('a', 'b')]); | |
| let cls2 = uclass(&[('c', 'd')]); | |
| let expected = uclass(&[]); | |
| assert_eq!(expected, uintersect(&cls1, &cls2)); | |
| let cls1 = uclass(&[('b', 'c')]); | |
| let cls2 = uclass(&[('a', 'd')]); | |
| let expected = uclass(&[('b', 'c')]); | |
| assert_eq!(expected, uintersect(&cls1, &cls2)); | |
| let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); | |
| let cls2 = uclass(&[('a', 'h')]); | |
| let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); | |
| assert_eq!(expected, uintersect(&cls1, &cls2)); | |
| let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); | |
| let cls2 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); | |
| let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); | |
| assert_eq!(expected, uintersect(&cls1, &cls2)); | |
| let cls1 = uclass(&[('a', 'b'), ('g', 'h')]); | |
| let cls2 = uclass(&[('d', 'e'), ('k', 'l')]); | |
| let expected = uclass(&[]); | |
| assert_eq!(expected, uintersect(&cls1, &cls2)); | |
| let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); | |
| let cls2 = uclass(&[('h', 'h')]); | |
| let expected = uclass(&[('h', 'h')]); | |
| assert_eq!(expected, uintersect(&cls1, &cls2)); | |
| let cls1 = uclass(&[('a', 'b'), ('e', 'f'), ('i', 'j')]); | |
| let cls2 = uclass(&[('c', 'd'), ('g', 'h'), ('k', 'l')]); | |
| let expected = uclass(&[]); | |
| assert_eq!(expected, uintersect(&cls1, &cls2)); | |
| let cls1 = uclass(&[('a', 'b'), ('c', 'd'), ('e', 'f')]); | |
| let cls2 = uclass(&[('b', 'c'), ('d', 'e'), ('f', 'g')]); | |
| let expected = uclass(&[('b', 'f')]); | |
| assert_eq!(expected, uintersect(&cls1, &cls2)); | |
| } | |
| #[test] | |
| fn class_intersect_bytes() { | |
| let cls1 = bclass(&[]); | |
| let cls2 = bclass(&[(b'a', b'a')]); | |
| let expected = bclass(&[]); | |
| assert_eq!(expected, bintersect(&cls1, &cls2)); | |
| let cls1 = bclass(&[(b'a', b'a')]); | |
| let cls2 = bclass(&[(b'a', b'a')]); | |
| let expected = bclass(&[(b'a', b'a')]); | |
| assert_eq!(expected, bintersect(&cls1, &cls2)); | |
| let cls1 = bclass(&[(b'a', b'a')]); | |
| let cls2 = bclass(&[(b'b', b'b')]); | |
| let expected = bclass(&[]); | |
| assert_eq!(expected, bintersect(&cls1, &cls2)); | |
| let cls1 = bclass(&[(b'a', b'a')]); | |
| let cls2 = bclass(&[(b'a', b'c')]); | |
| let expected = bclass(&[(b'a', b'a')]); | |
| assert_eq!(expected, bintersect(&cls1, &cls2)); | |
| let cls1 = bclass(&[(b'a', b'b')]); | |
| let cls2 = bclass(&[(b'a', b'c')]); | |
| let expected = bclass(&[(b'a', b'b')]); | |
| assert_eq!(expected, bintersect(&cls1, &cls2)); | |
| let cls1 = bclass(&[(b'a', b'b')]); | |
| let cls2 = bclass(&[(b'b', b'c')]); | |
| let expected = bclass(&[(b'b', b'b')]); | |
| assert_eq!(expected, bintersect(&cls1, &cls2)); | |
| let cls1 = bclass(&[(b'a', b'b')]); | |
| let cls2 = bclass(&[(b'c', b'd')]); | |
| let expected = bclass(&[]); | |
| assert_eq!(expected, bintersect(&cls1, &cls2)); | |
| let cls1 = bclass(&[(b'b', b'c')]); | |
| let cls2 = bclass(&[(b'a', b'd')]); | |
| let expected = bclass(&[(b'b', b'c')]); | |
| assert_eq!(expected, bintersect(&cls1, &cls2)); | |
| let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); | |
| let cls2 = bclass(&[(b'a', b'h')]); | |
| let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); | |
| assert_eq!(expected, bintersect(&cls1, &cls2)); | |
| let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); | |
| let cls2 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); | |
| let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); | |
| assert_eq!(expected, bintersect(&cls1, &cls2)); | |
| let cls1 = bclass(&[(b'a', b'b'), (b'g', b'h')]); | |
| let cls2 = bclass(&[(b'd', b'e'), (b'k', b'l')]); | |
| let expected = bclass(&[]); | |
| assert_eq!(expected, bintersect(&cls1, &cls2)); | |
| let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); | |
| let cls2 = bclass(&[(b'h', b'h')]); | |
| let expected = bclass(&[(b'h', b'h')]); | |
| assert_eq!(expected, bintersect(&cls1, &cls2)); | |
| let cls1 = bclass(&[(b'a', b'b'), (b'e', b'f'), (b'i', b'j')]); | |
| let cls2 = bclass(&[(b'c', b'd'), (b'g', b'h'), (b'k', b'l')]); | |
| let expected = bclass(&[]); | |
| assert_eq!(expected, bintersect(&cls1, &cls2)); | |
| let cls1 = bclass(&[(b'a', b'b'), (b'c', b'd'), (b'e', b'f')]); | |
| let cls2 = bclass(&[(b'b', b'c'), (b'd', b'e'), (b'f', b'g')]); | |
| let expected = bclass(&[(b'b', b'f')]); | |
| assert_eq!(expected, bintersect(&cls1, &cls2)); | |
| } | |
| #[test] | |
| fn class_difference_unicode() { | |
| let cls1 = uclass(&[('a', 'a')]); | |
| let cls2 = uclass(&[('a', 'a')]); | |
| let expected = uclass(&[]); | |
| assert_eq!(expected, udifference(&cls1, &cls2)); | |
| let cls1 = uclass(&[('a', 'a')]); | |
| let cls2 = uclass(&[]); | |
| let expected = uclass(&[('a', 'a')]); | |
| assert_eq!(expected, udifference(&cls1, &cls2)); | |
| let cls1 = uclass(&[]); | |
| let cls2 = uclass(&[('a', 'a')]); | |
| let expected = uclass(&[]); | |
| assert_eq!(expected, udifference(&cls1, &cls2)); | |
| let cls1 = uclass(&[('a', 'z')]); | |
| let cls2 = uclass(&[('a', 'a')]); | |
| let expected = uclass(&[('b', 'z')]); | |
| assert_eq!(expected, udifference(&cls1, &cls2)); | |
| let cls1 = uclass(&[('a', 'z')]); | |
| let cls2 = uclass(&[('z', 'z')]); | |
| let expected = uclass(&[('a', 'y')]); | |
| assert_eq!(expected, udifference(&cls1, &cls2)); | |
| let cls1 = uclass(&[('a', 'z')]); | |
| let cls2 = uclass(&[('m', 'm')]); | |
| let expected = uclass(&[('a', 'l'), ('n', 'z')]); | |
| assert_eq!(expected, udifference(&cls1, &cls2)); | |
| let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]); | |
| let cls2 = uclass(&[('a', 'z')]); | |
| let expected = uclass(&[]); | |
| assert_eq!(expected, udifference(&cls1, &cls2)); | |
| let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]); | |
| let cls2 = uclass(&[('d', 'v')]); | |
| let expected = uclass(&[('a', 'c')]); | |
| assert_eq!(expected, udifference(&cls1, &cls2)); | |
| let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]); | |
| let cls2 = uclass(&[('b', 'g'), ('s', 'u')]); | |
| let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]); | |
| assert_eq!(expected, udifference(&cls1, &cls2)); | |
| let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]); | |
| let cls2 = uclass(&[('b', 'd'), ('e', 'g'), ('s', 'u')]); | |
| let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]); | |
| assert_eq!(expected, udifference(&cls1, &cls2)); | |
| let cls1 = uclass(&[('x', 'z')]); | |
| let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]); | |
| let expected = uclass(&[('x', 'z')]); | |
| assert_eq!(expected, udifference(&cls1, &cls2)); | |
| let cls1 = uclass(&[('a', 'z')]); | |
| let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]); | |
| let expected = uclass(&[('d', 'd'), ('h', 'r'), ('v', 'z')]); | |
| assert_eq!(expected, udifference(&cls1, &cls2)); | |
| } | |
| #[test] | |
| fn class_difference_bytes() { | |
| let cls1 = bclass(&[(b'a', b'a')]); | |
| let cls2 = bclass(&[(b'a', b'a')]); | |
| let expected = bclass(&[]); | |
| assert_eq!(expected, bdifference(&cls1, &cls2)); | |
| let cls1 = bclass(&[(b'a', b'a')]); | |
| let cls2 = bclass(&[]); | |
| let expected = bclass(&[(b'a', b'a')]); | |
| assert_eq!(expected, bdifference(&cls1, &cls2)); | |
| let cls1 = bclass(&[]); | |
| let cls2 = bclass(&[(b'a', b'a')]); | |
| let expected = bclass(&[]); | |
| assert_eq!(expected, bdifference(&cls1, &cls2)); | |
| let cls1 = bclass(&[(b'a', b'z')]); | |
| let cls2 = bclass(&[(b'a', b'a')]); | |
| let expected = bclass(&[(b'b', b'z')]); | |
| assert_eq!(expected, bdifference(&cls1, &cls2)); | |
| let cls1 = bclass(&[(b'a', b'z')]); | |
| let cls2 = bclass(&[(b'z', b'z')]); | |
| let expected = bclass(&[(b'a', b'y')]); | |
| assert_eq!(expected, bdifference(&cls1, &cls2)); | |
| let cls1 = bclass(&[(b'a', b'z')]); | |
| let cls2 = bclass(&[(b'm', b'm')]); | |
| let expected = bclass(&[(b'a', b'l'), (b'n', b'z')]); | |
| assert_eq!(expected, bdifference(&cls1, &cls2)); | |
| let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]); | |
| let cls2 = bclass(&[(b'a', b'z')]); | |
| let expected = bclass(&[]); | |
| assert_eq!(expected, bdifference(&cls1, &cls2)); | |
| let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]); | |
| let cls2 = bclass(&[(b'd', b'v')]); | |
| let expected = bclass(&[(b'a', b'c')]); | |
| assert_eq!(expected, bdifference(&cls1, &cls2)); | |
| let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]); | |
| let cls2 = bclass(&[(b'b', b'g'), (b's', b'u')]); | |
| let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]); | |
| assert_eq!(expected, bdifference(&cls1, &cls2)); | |
| let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]); | |
| let cls2 = bclass(&[(b'b', b'd'), (b'e', b'g'), (b's', b'u')]); | |
| let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]); | |
| assert_eq!(expected, bdifference(&cls1, &cls2)); | |
| let cls1 = bclass(&[(b'x', b'z')]); | |
| let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]); | |
| let expected = bclass(&[(b'x', b'z')]); | |
| assert_eq!(expected, bdifference(&cls1, &cls2)); | |
| let cls1 = bclass(&[(b'a', b'z')]); | |
| let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]); | |
| let expected = bclass(&[(b'd', b'd'), (b'h', b'r'), (b'v', b'z')]); | |
| assert_eq!(expected, bdifference(&cls1, &cls2)); | |
| } | |
| #[test] | |
| fn class_symmetric_difference_unicode() { | |
| let cls1 = uclass(&[('a', 'm')]); | |
| let cls2 = uclass(&[('g', 't')]); | |
| let expected = uclass(&[('a', 'f'), ('n', 't')]); | |
| assert_eq!(expected, usymdifference(&cls1, &cls2)); | |
| } | |
| #[test] | |
| fn class_symmetric_difference_bytes() { | |
| let cls1 = bclass(&[(b'a', b'm')]); | |
| let cls2 = bclass(&[(b'g', b't')]); | |
| let expected = bclass(&[(b'a', b'f'), (b'n', b't')]); | |
| assert_eq!(expected, bsymdifference(&cls1, &cls2)); | |
| } | |
| #[test] | |
| #[should_panic] | |
| fn hir_byte_literal_non_ascii() { | |
| Hir::literal(Literal::Byte(b'a')); | |
| } | |
| // We use a thread with an explicit stack size to test that our destructor | |
| // for Hir can handle arbitrarily sized expressions in constant stack | |
| // space. In case we run on a platform without threads (WASM?), we limit | |
| // this test to Windows/Unix. | |
| #[test] | |
| #[cfg(any(unix, windows))] | |
| fn no_stack_overflow_on_drop() { | |
| use std::thread; | |
| let run = || { | |
| let mut expr = Hir::empty(); | |
| for _ in 0..100 { | |
| expr = Hir::group(Group { | |
| kind: GroupKind::NonCapturing, | |
| hir: Box::new(expr), | |
| }); | |
| expr = Hir::repetition(Repetition { | |
| kind: RepetitionKind::ZeroOrOne, | |
| greedy: true, | |
| hir: Box::new(expr), | |
| }); | |
| expr = Hir { | |
| kind: HirKind::Concat(vec![expr]), | |
| info: HirInfo::new(), | |
| }; | |
| expr = Hir { | |
| kind: HirKind::Alternation(vec![expr]), | |
| info: HirInfo::new(), | |
| }; | |
| } | |
| assert!(!expr.kind.is_empty()); | |
| }; | |
| // We run our test on a thread with a small stack size so we can | |
| // force the issue more easily. | |
| thread::Builder::new() | |
| .stack_size(1<<10) | |
| .spawn(run) | |
| .unwrap() | |
| .join() | |
| .unwrap(); | |
| } | |
| } |