diff --git a/src/librustdoc/lib.rs b/src/librustdoc/lib.rs index 28dbd8ba7d3ac..62e1ad2444d05 100644 --- a/src/librustdoc/lib.rs +++ b/src/librustdoc/lib.rs @@ -11,6 +11,7 @@ #![feature(file_buffered)] #![feature(format_args_nl)] #![feature(if_let_guard)] +#![feature(iter_advance_by)] #![feature(iter_intersperse)] #![feature(round_char_boundary)] #![feature(rustc_private)] diff --git a/src/librustdoc/passes/lint/html_tags.rs b/src/librustdoc/passes/lint/html_tags.rs index 19cf15d40a3b4..da09117b1bba7 100644 --- a/src/librustdoc/passes/lint/html_tags.rs +++ b/src/librustdoc/passes/lint/html_tags.rs @@ -1,9 +1,11 @@ //! Detects invalid HTML (like an unclosed ``) in doc comments. +use std::borrow::Cow; use std::iter::Peekable; use std::ops::Range; use std::str::CharIndices; +use itertools::Itertools as _; use pulldown_cmark::{BrokenLink, Event, LinkType, Parser, Tag, TagEnd}; use rustc_hir::HirId; use rustc_resolve::rustdoc::source_span_for_markdown_range; @@ -101,7 +103,7 @@ pub(crate) fn visit_item(cx: &DocContext<'_>, item: &Item, hir_id: HirId, dox: & }); }; - let mut tags = Vec::new(); + let mut tagp = TagParser::new(); let mut is_in_comment = None; let mut in_code_block = false; @@ -126,70 +128,65 @@ pub(crate) fn visit_item(cx: &DocContext<'_>, item: &Item, hir_id: HirId, dox: & }; let p = Parser::new_with_broken_link_callback(dox, main_body_opts(), Some(&mut replacer)) - .into_offset_iter(); + .into_offset_iter() + .coalesce(|a, b| { + // for some reason, pulldown-cmark splits html blocks into separate events for each line. + // we undo this, in order to handle multi-line tags. 
+ match (a, b) { + ((Event::Html(_), ra), (Event::Html(_), rb)) if ra.end == rb.start => { + let merged = ra.start..rb.end; + Ok((Event::Html(Cow::Borrowed(&dox[merged.clone()]).into()), merged)) + } + x => Err(x), + } + }); for (event, range) in p { match event { Event::Start(Tag::CodeBlock(_)) => in_code_block = true, Event::Html(text) | Event::InlineHtml(text) if !in_code_block => { - extract_tags(&mut tags, &text, range, &mut is_in_comment, &report_diag) + tagp.extract_tags(&text, range, &mut is_in_comment, &report_diag) } Event::End(TagEnd::CodeBlock) => in_code_block = false, _ => {} } } - for (tag, range) in tags.iter().filter(|(t, _)| { - let t = t.to_lowercase(); - !ALLOWED_UNCLOSED.contains(&t.as_str()) - }) { - report_diag(format!("unclosed HTML tag `{tag}`"), range, true); - } - if let Some(range) = is_in_comment { report_diag("Unclosed HTML comment".to_string(), &range, false); + } else if let &Some(quote_pos) = &tagp.quote_pos { + let qr = Range { start: quote_pos, end: quote_pos }; + report_diag( + format!("unclosed quoted HTML attribute on tag `{}`", &tagp.tag_name), + &qr, + false, + ); + } else { + if !tagp.tag_name.is_empty() { + report_diag( + format!("incomplete HTML tag `{}`", &tagp.tag_name), + &(tagp.tag_start_pos..dox.len()), + false, + ); + } + for (tag, range) in tagp.tags.iter().filter(|(t, _)| { + let t = t.to_lowercase(); + !is_implicitly_self_closing(&t) + }) { + report_diag(format!("unclosed HTML tag `{tag}`"), range, true); + } } } +/// These tags are interpreted as self-closing if they lack an explicit closing tag. 
const ALLOWED_UNCLOSED: &[&str] = &[ "area", "base", "br", "col", "embed", "hr", "img", "input", "keygen", "link", "meta", "param", "source", "track", "wbr", ]; -fn drop_tag( - tags: &mut Vec<(String, Range)>, - tag_name: String, - range: Range, - f: &impl Fn(String, &Range, bool), -) { - let tag_name_low = tag_name.to_lowercase(); - if let Some(pos) = tags.iter().rposition(|(t, _)| t.to_lowercase() == tag_name_low) { - // If the tag is nested inside a "` (the `h2` tag isn't required - // but it helps for the visualization). - f(format!("unopened HTML tag `{tag_name}`"), &range, false); - } +/// Allows constructs like ``, but not ` bool { + ALLOWED_UNCLOSED.contains(&tag_name) } fn extract_path_backwards(text: &str, end_pos: usize) -> Option { @@ -252,151 +249,292 @@ fn is_valid_for_html_tag_name(c: char, is_empty: bool) -> bool { c.is_ascii_alphabetic() || !is_empty && (c == '-' || c.is_ascii_digit()) } -fn extract_html_tag( - tags: &mut Vec<(String, Range)>, - text: &str, - range: &Range, - start_pos: usize, - iter: &mut Peekable>, - f: &impl Fn(String, &Range, bool), -) { - let mut tag_name = String::new(); - let mut is_closing = false; - let mut prev_pos = start_pos; +/// Parse html tags to ensure they are well-formed +#[derive(Debug, Clone)] +struct TagParser { + tags: Vec<(String, Range)>, + /// Name of the tag that is being parsed, if we are within a tag. + /// + /// Since the `<` and name of a tag must appear on the same line with no whitespace, + /// if this is the empty string, we are not in a tag. + tag_name: String, + tag_start_pos: usize, + is_closing: bool, + /// `true` if we are within a tag, but not within its name. + in_attrs: bool, + /// If we are in a quoted attribute, what quote char does it use? + /// + /// This needs to be stored in the struct since HTML5 allows newlines in quoted attrs. 
+ quote: Option, + quote_pos: Option, + after_eq: bool, +} - loop { - let (pos, c) = match iter.peek() { - Some((pos, c)) => (*pos, *c), - // In case we reached the of the doc comment, we want to check that it's an - // unclosed HTML tag. For example "/// (prev_pos, '\0'), - }; - prev_pos = pos; - // Checking if this is a closing tag (like `` for ``). - if c == '/' && tag_name.is_empty() { - is_closing = true; - } else if is_valid_for_html_tag_name(c, tag_name.is_empty()) { - tag_name.push(c); - } else { - if !tag_name.is_empty() { - let mut r = Range { start: range.start + start_pos, end: range.start + pos }; - if c == '>' { - // In case we have a tag without attribute, we can consider the span to - // refer to it fully. - r.end += 1; +impl TagParser { + fn new() -> Self { + Self { + tags: Vec::new(), + tag_name: String::with_capacity(8), + tag_start_pos: 0, + is_closing: false, + in_attrs: false, + quote: None, + quote_pos: None, + after_eq: false, + } + } + + fn drop_tag(&mut self, range: Range, f: &impl Fn(String, &Range, bool)) { + let tag_name_low = self.tag_name.to_lowercase(); + if let Some(pos) = self.tags.iter().rposition(|(t, _)| t.to_lowercase() == tag_name_low) { + // If the tag is nested inside a "` (the `h2` tag isn't required + // but it helps for the visualization). + f(format!("unopened HTML tag `{}`", &self.tag_name), &range, false); + } + } + + /// Handle a `<` that appeared while parsing a tag. + fn handle_lt_in_tag( + &mut self, + range: Range, + lt_pos: usize, + f: &impl Fn(String, &Range, bool), + ) { + let global_pos = range.start + lt_pos; + // is this check needed? + if global_pos == self.tag_start_pos { + // `<` is in the tag because it is the start. 
+ return; + } + // tried to start a new tag while in a tag + f( + format!("incomplete HTML tag `{}`", &self.tag_name), + &(self.tag_start_pos..global_pos), + false, + ); + self.tag_parsed(); + } + + fn extract_html_tag( + &mut self, + text: &str, + range: &Range, + start_pos: usize, + iter: &mut Peekable>, + f: &impl Fn(String, &Range, bool), + ) { + let mut prev_pos = start_pos; + + 'outer_loop: loop { + let (pos, c) = match iter.peek() { + Some((pos, c)) => (*pos, *c), + // In case we reached the of the doc comment, we want to check that it's an + // unclosed HTML tag. For example "/// (prev_pos, '\0'), + None => break, + }; + prev_pos = pos; + if c == '/' && self.tag_name.is_empty() { + // Checking if this is a closing tag (like `` for ``). + self.is_closing = true; + } else if !self.in_attrs && is_valid_for_html_tag_name(c, self.tag_name.is_empty()) { + self.tag_name.push(c); + } else { + if !self.tag_name.is_empty() { + self.in_attrs = true; + let mut r = Range { start: range.start + start_pos, end: range.start + pos }; + if c == '>' { + // In case we have a tag without attribute, we can consider the span to + // refer to it fully. + r.end += 1; + } + if self.is_closing { + // In case we have "" or even "". + if c != '>' { if !c.is_whitespace() { - if c == '>' { - r.end = range.start + new_pos + 1; - found = true; - } + // It seems like it's not a valid HTML tag. 
break; } - } - if !found { - break; - } - } - drop_tag(tags, tag_name, r, f); - } else { - let mut is_self_closing = false; - let mut quote_pos = None; - if c != '>' { - let mut quote = None; - let mut after_eq = false; - for (i, c) in text[pos..].char_indices() { - if !c.is_whitespace() { - if let Some(q) = quote { - if c == q { - quote = None; - quote_pos = None; - after_eq = false; + let mut found = false; + for (new_pos, c) in text[pos..].char_indices() { + if !c.is_whitespace() { + if c == '>' { + r.end = range.start + new_pos + 1; + found = true; + } else if c == '<' { + self.handle_lt_in_tag(range.clone(), pos + new_pos, f); } - } else if c == '>' { break; - } else if c == '/' && !after_eq { - is_self_closing = true; - } else { - if is_self_closing { - is_self_closing = false; - } - if (c == '"' || c == '\'') && after_eq { - quote = Some(c); - quote_pos = Some(pos + i); - } else if c == '=' { - after_eq = true; - } } - } else if quote.is_none() { - after_eq = false; + } + if !found { + break 'outer_loop; } } - } - if let Some(quote_pos) = quote_pos { - let qr = Range { start: quote_pos, end: quote_pos }; - f( - format!("unclosed quoted HTML attribute on tag `{tag_name}`"), - &qr, - false, - ); - } - if is_self_closing { - // https://html.spec.whatwg.org/#parse-error-non-void-html-element-start-tag-with-trailing-solidus - let valid = ALLOWED_UNCLOSED.contains(&&tag_name[..]) - || tags.iter().take(pos + 1).any(|(at, _)| { - let at = at.to_lowercase(); - at == "svg" || at == "math" - }); - if !valid { - f(format!("invalid self-closing HTML tag `{tag_name}`"), &r, false); - } + self.drop_tag(r, f); + self.tag_parsed(); } else { - tags.push((tag_name, r)); + self.extract_opening_tag(text, range, r, pos, c, iter, f) } } + break; } - break; + iter.next(); } - iter.next(); } -} - -fn extract_tags( - tags: &mut Vec<(String, Range)>, - text: &str, - range: Range, - is_in_comment: &mut Option>, - f: &impl Fn(String, &Range, bool), -) { - let mut iter = 
text.char_indices().peekable(); - while let Some((start_pos, c)) = iter.next() { - if is_in_comment.is_some() { - if text[start_pos..].starts_with("-->") { - *is_in_comment = None; + fn extract_opening_tag( + &mut self, + text: &str, + range: &Range, + r: Range, + pos: usize, + c: char, + iter: &mut Peekable>, + f: &impl Fn(String, &Range, bool), + ) { + // we can store this as a local, since html5 does require the `/` and `>` + // to not be separated by whitespace. + let mut is_self_closing = false; + if c != '>' { + 'parse_til_gt: { + for (i, c) in text[pos..].char_indices() { + if !c.is_whitespace() { + debug_assert_eq!(self.quote_pos.is_some(), self.quote.is_some()); + if let Some(q) = self.quote { + if c == q { + self.quote = None; + self.quote_pos = None; + self.after_eq = false; + } + } else if c == '>' { + break 'parse_til_gt; + } else if c == '<' { + self.handle_lt_in_tag(range.clone(), pos + i, f); + } else if c == '/' && !self.after_eq { + is_self_closing = true; + } else { + if is_self_closing { + is_self_closing = false; + } + if (c == '"' || c == '\'') && self.after_eq { + self.quote = Some(c); + self.quote_pos = Some(pos + i); + } else if c == '=' { + self.after_eq = true; + } + } + } else if self.quote.is_none() { + self.after_eq = false; + } + if !is_self_closing && !self.tag_name.is_empty() { + iter.next(); + } + } + // if we've run out of text but still haven't found a `>`, + // return early without calling `tag_parsed` or emitting lints. + // this allows us to either find the `>` in a later event + // or emit a lint about it being missing. + return; } - } else if c == '<' { - if text[start_pos..].starts_with("") { + *is_in_comment = None; + } + } else if c == '<' { + // " @@ -105,7 +106,7 @@ pub fn j() {} /// uiapp.run(&env::args().collect::>()); /// ``` /// -/// shouldn't warn! +// shouldn't warn! 
/// `````` pub fn k() {} @@ -121,3 +122,92 @@ pub fn no_error_1() {} /// backslashed \< //~^ ERROR unclosed HTML tag `a` pub fn p() {} + +/// +/// +/// +pub fn no_error_2() {} + +///
+/// +///
+pub fn no_error_3() {} + +/// >
class="foo"> +/// >
+pub fn no_error_4() {} + +/// unfinished ALLOWED_UNCLOSED +/// +/// note: CommonMark doesn't allow an html block to start with a multiline tag, +/// so we use `
` a bunch to force these to be parsed as html blocks. +/// +///
+/// +//~^ ERROR incomplete HTML tag `img` +pub fn r() {} + +/// >
+/// > href="#broken" +pub fn s() {} + +///
+/// +//~^ ERROR incomplete HTML tag `br` +pub fn t() {} + +///
+///
html5 allows this
+pub fn no_error_5() {} + +///
+/// +pub fn no_error_6() {} + +///
+/// what +pub fn no_error_7() {} + +/// Technically this is allowed per the html5 spec, +/// but there's basically no legitimate reason to do it, +/// so we don't allow it. +/// +///

foobar

+//~^ ERROR Unclosed HTML comment +//~| ERROR incomplete HTML tag `p` +pub fn v() {} diff --git a/tests/rustdoc-ui/lints/invalid-html-tags.stderr b/tests/rustdoc-ui/lints/invalid-html-tags.stderr index 9c2bfcf2c3dd7..b6ec22c247901 100644 --- a/tests/rustdoc-ui/lints/invalid-html-tags.stderr +++ b/tests/rustdoc-ui/lints/invalid-html-tags.stderr @@ -52,6 +52,12 @@ error: unclosed HTML tag `p` LL | ///

| ^^^ +error: incomplete HTML tag `script` + --> $DIR/invalid-html-tags.rs:45:5 + | +LL | ///