syntax: remove all uses of 'as'

It turns out that all uses of 'as' in the regex-syntax crate can be replaced with either explicitly infallible routines (like 'u32::from(char)'), or with routines that will panic on failure. These panics are strictly better than truncating casts that might otherwise lead to subtle bugs in the context of this crate. (Namely, we never really care about the perf effects here, since regex parsing is just never a bottleneck.)
rust-lang · Aug 26, 2022 · b50054a · b50054a
1 parent 58dd511
commit b50054a
Show file tree

Hide file tree

Showing 9 changed files with 178 additions and 156 deletions.
diff --git a/regex-syntax/src/ast/mod.rs b/regex-syntax/src/ast/mod.rs
@@ -615,11 +615,12 @@ impl Literal {
     /// If this literal was written as a `\x` hex escape, then this returns
     /// the corresponding byte value. Otherwise, this returns `None`.
     pub fn byte(&self) -> Option<u8> {
-        let short_hex = LiteralKind::HexFixed(HexLiteralKind::X);
-        if self.c as u32 <= 255 && self.kind == short_hex {
-            Some(self.c as u8)
-        } else {
-            None
+        match self.kind {
+            LiteralKind::HexFixed(HexLiteralKind::X) => {
+                // MSRV(1.59): Use 'u8::try_from(self.c)' instead.
+                u8::try_from(u32::from(self.c)).ok()
+            }
+            _ => None,
         }
     }
 }

diff --git a/regex-syntax/src/ast/print.rs b/regex-syntax/src/ast/print.rs
@@ -213,24 +213,24 @@ impl<W: fmt::Write> Writer<W> {
         match ast.kind {
             Verbatim => self.wtr.write_char(ast.c),
             Punctuation => write!(self.wtr, r"\{}", ast.c),
-            Octal => write!(self.wtr, r"\{:o}", ast.c as u32),
+            Octal => write!(self.wtr, r"\{:o}", u32::from(ast.c)),
             HexFixed(ast::HexLiteralKind::X) => {
-                write!(self.wtr, r"\x{:02X}", ast.c as u32)
+                write!(self.wtr, r"\x{:02X}", u32::from(ast.c))
             }
             HexFixed(ast::HexLiteralKind::UnicodeShort) => {
-                write!(self.wtr, r"\u{:04X}", ast.c as u32)
+                write!(self.wtr, r"\u{:04X}", u32::from(ast.c))
             }
             HexFixed(ast::HexLiteralKind::UnicodeLong) => {
-                write!(self.wtr, r"\U{:08X}", ast.c as u32)
+                write!(self.wtr, r"\U{:08X}", u32::from(ast.c))
             }
             HexBrace(ast::HexLiteralKind::X) => {
-                write!(self.wtr, r"\x{{{:X}}}", ast.c as u32)
+                write!(self.wtr, r"\x{{{:X}}}", u32::from(ast.c))
             }
             HexBrace(ast::HexLiteralKind::UnicodeShort) => {
-                write!(self.wtr, r"\u{{{:X}}}", ast.c as u32)
+                write!(self.wtr, r"\u{{{:X}}}", u32::from(ast.c))
             }
             HexBrace(ast::HexLiteralKind::UnicodeLong) => {
-                write!(self.wtr, r"\U{{{:X}}}", ast.c as u32)
+                write!(self.wtr, r"\U{{{:X}}}", u32::from(ast.c))
             }
             Special(ast::SpecialLiteralKind::Bell) => {
                 self.wtr.write_str(r"\a")

diff --git a/regex-syntax/src/hir/interval.rs b/regex-syntax/src/hir/interval.rs
@@ -481,7 +481,7 @@ impl Bound for u8 {
         u8::MAX
     }
     fn as_u32(self) -> u32 {
-        self as u32
+        u32::from(self)
     }
     fn increment(self) -> Self {
         self.checked_add(1).unwrap()
@@ -499,20 +499,20 @@ impl Bound for char {
         '\u{10FFFF}'
     }
     fn as_u32(self) -> u32 {
-        self as u32
+        u32::from(self)
     }
 
     fn increment(self) -> Self {
         match self {
             '\u{D7FF}' => '\u{E000}',
-            c => char::from_u32((c as u32).checked_add(1).unwrap()).unwrap(),
+            c => char::from_u32(u32::from(c).checked_add(1).unwrap()).unwrap(),
         }
     }
 
     fn decrement(self) -> Self {
         match self {
             '\u{E000}' => '\u{D7FF}',
-            c => char::from_u32((c as u32).checked_sub(1).unwrap()).unwrap(),
+            c => char::from_u32(u32::from(c).checked_sub(1).unwrap()).unwrap(),
         }
     }
 }

diff --git a/regex-syntax/src/hir/literal/mod.rs b/regex-syntax/src/hir/literal/mod.rs
@@ -475,8 +475,8 @@ impl Literals {
             base = vec![Literal::empty()];
         }
         for r in cls.iter() {
-            let (s, e) = (r.start as u32, r.end as u32 + 1);
-            for c in (s..e).filter_map(char::from_u32) {
+            let (s, e) = (u32::from(r.start), u32::from(r.end));
+            for c in (s..=e).filter_map(char::from_u32) {
                 for mut lit in base.clone() {
                     let mut bytes = c.to_string().into_bytes();
                     if reverse {
@@ -502,8 +502,7 @@ impl Literals {
             base = vec![Literal::empty()];
         }
         for r in cls.iter() {
-            let (s, e) = (r.start as u32, r.end as u32 + 1);
-            for b in (s..e).map(|b| b as u8) {
+            for b in r.start..=r.end {
                 for mut lit in base.clone() {
                     lit.push(b);
                     self.lits.push(lit);
@@ -784,7 +783,10 @@ fn repeat_range_literals<F: FnMut(&Hir, &mut Literals)>(
     lits: &mut Literals,
     mut f: F,
 ) {
-    if min == 0 {
+    // If 'min' somehow overflows usize, then we just treat it as 0, which is
+    // the most conservative thing we can do.
+    let umin = usize::try_from(min).unwrap_or(0);
+    if umin == 0 {
         // This is a bit conservative. If `max` is set, then we could
         // treat this as a finite set of alternations. For now, we
         // just treat it as `e*`.
@@ -797,11 +799,11 @@ fn repeat_range_literals<F: FnMut(&Hir, &mut Literals)>(
             lits,
         );
     } else {
-        if min > 0 {
-            let n = cmp::min(lits.limit_size, min as usize);
+        if umin > 0 {
+            let n = cmp::min(lits.limit_size, umin);
             let es = iter::repeat(e.clone()).take(n).collect();
             f(&Hir::concat(es), lits);
-            if n < min as usize || lits.contains_empty() {
+            if n < umin || lits.contains_empty() {
                 lits.cut();
             }
         }
@@ -928,12 +930,13 @@ fn escape_unicode(bytes: &[u8]) -> String {
     let mut space_escaped = String::new();
     for c in show.chars() {
         if c.is_whitespace() {
-            let escaped = if c as u32 <= 0x7F {
-                escape_byte(c as u8)
-            } else if c as u32 <= 0xFFFF {
-                format!(r"\u{{{:04x}}}", c as u32)
+            let cp = u32::from(c);
+            let escaped = if cp <= 0x7F {
+                escape_byte(u8::try_from(cp).unwrap())
+            } else if cp <= 0xFFFF {
+                format!(r"\u{{{:04x}}}", cp)
             } else {
-                format!(r"\U{{{:08x}}}", c as u32)
+                format!(r"\U{{{:08x}}}", cp)
             };
             space_escaped.push_str(&escaped);
         } else {
@@ -959,13 +962,11 @@ fn escape_byte(byte: u8) -> String {
 }
 
 fn cls_char_count(cls: &hir::ClassUnicode) -> usize {
-    cls.iter().map(|&r| 1 + (r.end as u32) - (r.start as u32)).sum::<u32>()
-        as usize
+    cls.iter().map(|&r| r.len()).sum()
 }
 
 fn cls_byte_count(cls: &hir::ClassBytes) -> usize {
-    cls.iter().map(|&r| 1 + (r.end as u32) - (r.start as u32)).sum::<u32>()
-        as usize
+    cls.iter().map(|&r| r.len()).sum()
 }
 
 #[cfg(test)]

diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs
@@ -90,6 +90,13 @@ pub enum ErrorKind {
     __Nonexhaustive,
 }
 
+// BREADCRUMBS:
+//
+// Remove EmptyClassNotAllowed
+// Make errors non_exhaustive
+// Simplify repetitions (get rid of ZeroOrOne, OneOrMore etc)
+// Get rid of deprecated things
+
 impl ErrorKind {
     // TODO: Remove this method entirely on the next breaking semver release.
     #[allow(deprecated)]
@@ -1013,12 +1020,12 @@ impl fmt::Debug for ClassUnicodeRange {
         {
             self.start.to_string()
         } else {
-            format!("0x{:X}", self.start as u32)
+            format!("0x{:X}", u32::from(self.start))
         };
         let end = if !self.end.is_whitespace() && !self.end.is_control() {
             self.end.to_string()
         } else {
-            format!("0x{:X}", self.end as u32)
+            format!("0x{:X}", u32::from(self.end))
         };
         f.debug_struct("ClassUnicodeRange")
             .field("start", &start)
@@ -1058,10 +1065,9 @@ impl Interval for ClassUnicodeRange {
         if !unicode::contains_simple_case_mapping(self.start, self.end)? {
             return Ok(());
         }
-        let start = self.start as u32;
-        let end = (self.end as u32).saturating_add(1);
+        let (start, end) = (u32::from(self.start), u32::from(self.end));
         let mut next_simple_cp = None;
-        for cp in (start..end).filter_map(char::from_u32) {
+        for cp in (start..=end).filter_map(char::from_u32) {
             if next_simple_cp.map_or(false, |next| cp < next) {
                 continue;
             }
@@ -1104,6 +1110,18 @@ impl ClassUnicodeRange {
     pub fn end(&self) -> char {
         self.end
     }
+
+    /// Returns the number of codepoints in this range.
+    pub fn len(&self) -> usize {
+        let diff = 1 + u32::from(self.end) - u32::from(self.start);
+        // This is likely to panic in 16-bit targets since a usize can only fit
+        // 2^16. It's not clear what to do here, other than to return an error
+        // when building a Unicode class that contains a range whose length
+        // overflows usize. (Which, to be honest, is probably quite common on
+        // 16-bit targets. For example, this would imply that '.' and '\p{any}'
+        // would be impossible to build.)
+        usize::try_from(diff).expect("char class len fits in usize")
+    }
 }
 
 /// A set of characters represented by arbitrary bytes (where one byte
@@ -1291,18 +1309,27 @@ impl ClassBytesRange {
     pub fn end(&self) -> u8 {
         self.end
     }
+
+    /// Returns the number of bytes in this range.
+    pub fn len(&self) -> usize {
+        usize::from(self.end.checked_sub(self.start).unwrap())
+            .checked_add(1)
+            .unwrap()
+    }
 }
 
 impl fmt::Debug for ClassBytesRange {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         let mut debug = f.debug_struct("ClassBytesRange");
         if self.start <= 0x7F {
-            debug.field("start", &(self.start as char));
+            let ch = char::try_from(self.start).unwrap();
+            debug.field("start", &ch);
         } else {
             debug.field("start", &self.start);
         }
         if self.end <= 0x7F {
-            debug.field("end", &(self.end as char));
+            let ch = char::try_from(self.start).unwrap();
+            debug.field("end", &ch);
         } else {
             debug.field("end", &self.end);
         }

diff --git a/regex-syntax/src/hir/print.rs b/regex-syntax/src/hir/print.rs
@@ -217,18 +217,16 @@ impl<W: fmt::Write> Writer<W> {
     }
 
     fn write_literal_byte(&mut self, b: u8) -> fmt::Result {
-        let c = b as char;
-        if c <= 0x7F as char && !c.is_control() && !c.is_whitespace() {
-            self.write_literal_char(c)
+        if b <= 0x7F && !b.is_ascii_control() && !b.is_ascii_whitespace() {
+            self.write_literal_char(char::try_from(b).unwrap())
         } else {
             write!(self.wtr, "(?-u:\\x{:02X})", b)
         }
     }
 
     fn write_literal_class_byte(&mut self, b: u8) -> fmt::Result {
-        let c = b as char;
-        if c <= 0x7F as char && !c.is_control() && !c.is_whitespace() {
-            self.write_literal_char(c)
+        if b <= 0x7F && !b.is_ascii_control() && !b.is_ascii_whitespace() {
+            self.write_literal_char(char::try_from(b).unwrap())
         } else {
             write!(self.wtr, "\\x{:02X}", b)
         }