Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement fragment parsing #91

Merged
merged 3 commits into from Feb 19, 2015
Merged
Changes from all commits
Commits
File filter...
Filter file types
Jump to…
Jump to file
Failed to load files.

Always

Just for now

@@ -33,3 +33,23 @@ tb: tests2.dat-44
tb: tests26.dat-9
tb: tests5.dat-16
tb: webkit02.dat-2
tb: foreign-fragment.dat-0
tb: foreign-fragment.dat-1
tb: foreign-fragment.dat-18
tb: foreign-fragment.dat-19
tb: foreign-fragment.dat-2
tb: foreign-fragment.dat-22
tb: foreign-fragment.dat-23
tb: foreign-fragment.dat-26
tb: foreign-fragment.dat-27
tb: foreign-fragment.dat-3
tb: foreign-fragment.dat-30
tb: foreign-fragment.dat-31
tb: foreign-fragment.dat-34
tb: foreign-fragment.dat-35
tb: foreign-fragment.dat-38
tb: foreign-fragment.dat-39
tb: foreign-fragment.dat-40
tb: foreign-fragment.dat-41
tb: foreign-fragment.dat-47
tb: foreign-fragment.dat-48
@@ -18,6 +18,8 @@ use core::default::Default;
use core::option;
use collections::string::String;

use string_cache::{Atom, QualName};

/// Convenience function to turn a single `String` into an iterator.
pub fn one_input(x: String) -> option::IntoIter<String> {
Some(x).into_iter()
@@ -82,6 +84,38 @@ pub fn parse_to<
tok.unwrap().unwrap()
}

/// Parse an HTML fragment, sending the resulting tree operations to a `TreeSink`.
///
/// A context element named `context` (HTML namespace, no attributes) is
/// created through the sink before parsing starts, and the tokenizer's
/// initial state is taken from the tree builder's view of that element.
/// The sink is returned once all of `input` has been fed and the
/// tokenizer has been ended.
///
/// ## Example
///
/// ```ignore
/// let mut sink = MySink;
/// parse_fragment_to(&mut sink, one_input(my_str), context_token, Default::default());
/// ```
pub fn parse_fragment_to<
    Sink: TreeSink,
    It: Iterator<Item=String>
>(
    mut sink: Sink,
    input: It,
    context: Atom,
    opts: ParseOpts) -> Sink {

    // The context element must be created while we still own the sink;
    // the tree builder takes the sink by value on the next line.
    let context_name = QualName::new(ns!(HTML), context);
    let context_elem = sink.create_element(context_name, vec!());
    let builder = TreeBuilder::new_for_fragment(sink, context_elem, None, opts.tree_builder);

    // Override only the initial tokenizer state; every other tokenizer
    // option comes from the caller.
    let initial_state = builder.tokenizer_state_for_context_elem();
    let tokenizer_opts = TokenizerOpts {
        initial_state: Some(initial_state),
        .. opts.tokenizer
    };

    let mut tokenizer = Tokenizer::new(builder, tokenizer_opts);
    for chunk in input {
        tokenizer.feed(chunk);
    }
    tokenizer.end();

    // Two unwraps: the first undoes the tokenizer layer, the second
    // yields the `Sink` this function returns.
    tokenizer.unwrap().unwrap()
}

/// Results which can be extracted from a `TreeSink`.
///
/// Implement this for your parse tree data type so that it
@@ -105,3 +139,18 @@ pub fn parse<Output, It>(input: It, opts: ParseOpts) -> Output
let sink = parse_to(Default::default(), input, opts);
ParseResult::get_result(sink)
}

/// Parse an HTML fragment into a type which implements `ParseResult`.
///
/// A default-constructed sink is driven by `parse_fragment_to` using the
/// given `context` element name and `opts`, and the finished sink is then
/// converted with `ParseResult::get_result`.
///
/// ## Example
///
/// ```ignore
/// let dom: RcDom = parse_fragment(one_input(my_str), context_token, Default::default());
/// ```
pub fn parse_fragment<Output, It>(input: It, context: Atom, opts: ParseOpts) -> Output
    where Output: ParseResult,
          It: Iterator<Item=String>,
{
    // The sink type is inferred from what `get_result` expects for `Output`.
    ParseResult::get_result(parse_fragment_to(Default::default(), input, context, opts))
}
@@ -49,7 +49,7 @@ extern crate phf;
extern crate time;

pub use tokenizer::Attribute;
pub use driver::{one_input, ParseOpts, parse_to, parse};
pub use driver::{one_input, ParseOpts, parse_to, parse_fragment_to, parse, parse_fragment};

#[cfg(not(for_c))]
pub use serialize::serialize;
@@ -665,14 +665,17 @@ impl<Handle, Sink> TreeBuilderActions<Handle>
}
}

// https://html.spec.whatwg.org/multipage/syntax.html#reset-the-insertion-mode-appropriately
fn reset_insertion_mode(&mut self) -> InsertionMode {
for (i, node) in self.open_elems.iter().enumerate().rev() {
for (i, mut node) in self.open_elems.iter().enumerate().rev() {
let last = i == 0u;
if let (true, Some(ctx)) = (last, self.context_elem.as_ref()) {
node = ctx;
}
let name = match self.sink.elem_name(node.clone()) {
QualName { ns: ns!(HTML), local } => local,
_ => continue,
};
let last = i == 0u;
// FIXME: fragment case context element
match name {
// FIXME: <select> sub-steps
atom!(select) => return InSelect,
@@ -19,9 +19,12 @@ use self::types::*;
use self::actions::TreeBuilderActions;
use self::rules::TreeBuilderStep;

use string_cache::QualName;

use tokenizer;
use tokenizer::{Doctype, Tag};
use tokenizer::TokenSink;
use tokenizer::states as tok_state;

use util::str::{is_ascii_whitespace, char_run};

@@ -33,7 +36,8 @@ use std::borrow::Cow::Borrowed;
use collections::RingBuf;

#[macro_use] mod tag_sets;
mod interface;
// "pub" is a workaround for rust#18241 (?)
pub mod interface;
mod data;
mod types;
mod actions;
@@ -52,9 +56,6 @@ pub struct TreeBuilderOpts {
/// Is this an `iframe srcdoc` document?
pub iframe_srcdoc: bool,

/// Are we parsing a HTML fragment?
pub fragment: bool,

/// Should we drop the DOCTYPE (if any) from the tree?
pub drop_doctype: bool,

@@ -75,7 +76,6 @@ impl Default for TreeBuilderOpts {
exact_errors: false,
scripting_enabled: true,
iframe_srcdoc: false,
fragment: false,
drop_doctype: false,
ignore_missing_rules: false,
}
@@ -135,6 +135,9 @@ pub struct TreeBuilder<Handle, Sink> {
/// Is foster parenting enabled?
foster_parenting: bool,

/// The context element for the fragment parsing algorithm.
context_elem: Option<Handle>,

// WARNING: If you add new fields that contain Handles, you
// must add them to trace_handles() below to preserve memory
// safety!
@@ -168,6 +171,77 @@ impl<Handle, Sink> TreeBuilder<Handle, Sink>
frameset_ok: true,
ignore_lf: false,
foster_parenting: false,
context_elem: None,
}
}

/// Create a new tree builder for fragment parsing, sending tree
/// modifications to a particular `TreeSink`.
///
/// `context_elem` is stored as the builder's context element and
/// `form_elem` as its form element. Before returning, a root element is
/// created and the insertion mode is reset.
///
/// The tree builder is also a `TokenSink`.
pub fn new_for_fragment(mut sink: Sink,
                        context_elem: Handle,
                        form_elem: Option<Handle>,
                        opts: TreeBuilderOpts) -> TreeBuilder<Handle, Sink> {
    let document = sink.get_document();

    // A `template` context element starts the builder with `InTemplate`
    // already on the template-mode list; otherwise the list is empty.
    let template_modes = if sink.elem_name(context_elem.clone()) == qualname!(HTML, template) {
        vec![InTemplate]
    } else {
        vec![]
    };

    let mut builder = TreeBuilder {
        opts: opts,
        sink: sink,
        mode: Initial,
        orig_mode: None,
        template_modes: template_modes,
        pending_table_text: vec!(),
        quirks_mode: NoQuirks, // FIXME(#96) set this to match the sink's document
        doc_handle: document,
        open_elems: vec!(),
        active_formatting: vec!(),
        head_elem: None,
        form_elem: form_elem,
        next_tokenizer_state: None,
        frameset_ok: true,
        ignore_lf: false,
        foster_parenting: false,
        context_elem: Some(context_elem),
    };

    // https://html.spec.whatwg.org/multipage/syntax.html#parsing-html-fragments
    // 5. Let root be a new html element with no attributes.
    // 6. Append the element root to the Document node created above.
    // 7. Set up the parser's stack of open elements so that it contains just the single element root.
    builder.create_root(vec!());

    // 10. Reset the parser's insertion mode appropriately.
    builder.mode = builder.reset_insertion_mode();

    builder
}

/// Pick the tokenizer state that fragment parsing should start in, based
/// on the name of the context element.
///
/// https://html.spec.whatwg.org/multipage/syntax.html#concept-frag-parse-context
/// Step 4. Set the state of the HTML parser's tokenization stage as follows:
///
/// Panics with "no context element" if this builder has no context element.
pub fn tokenizer_state_for_context_elem(&self) -> tok_state::State {
    let context = self.context_elem.clone().expect("no context element");
    match self.sink.elem_name(context) {
        QualName { ns: ns!(HTML), local } => match local {
            atom!(title) | atom!(textarea) => tok_state::RawData(tok_state::Rcdata),

            atom!(style) | atom!(xmp) | atom!(iframe)
                | atom!(noembed) | atom!(noframes) => tok_state::RawData(tok_state::Rawtext),

            atom!(script) => tok_state::RawData(tok_state::ScriptData),

            // With scripting disabled, `noscript` falls through to the
            // default arm below and tokenizes as ordinary data.
            atom!(noscript) if self.opts.scripting_enabled =>
                tok_state::RawData(tok_state::Rawtext),

            atom!(plaintext) => tok_state::Plaintext,

            _ => tok_state::Data
        },

        // Context elements outside the HTML namespace always start in the data state.
        _ => tok_state::Data
    }
}

@@ -198,6 +272,7 @@ impl<Handle, Sink> TreeBuilder<Handle, Sink>
}
self.head_elem.as_ref().map(|h| tracer.trace_handle(h.clone()));
self.form_elem.as_ref().map(|h| tracer.trace_handle(h.clone()));
self.context_elem.as_ref().map(|h| tracer.trace_handle(h.clone()));
}

// Debug helper
@@ -285,6 +360,11 @@ impl<Handle, Sink> TreeBuilder<Handle, Sink>
}
}
}

/// Are we parsing an HTML fragment?
///
/// Returns `true` exactly when this tree builder holds a context element.
pub fn is_fragment(&self) -> bool {
self.context_elem.is_some()
}
}

impl<Handle, Sink> TokenSink
@@ -136,7 +136,7 @@ impl<Handle, Sink> TreeBuilderStep<Handle>

tag @ <script> => {
let elem = self.sink.create_element(qualname!(HTML, script), tag.attrs);
if self.opts.fragment {
if self.is_fragment() {
self.sink.mark_script_already_started(elem.clone());
}
self.insert_appropriately(AppendNode(elem.clone()), None);
@@ -1204,7 +1204,7 @@ impl<Handle, Sink> TreeBuilderStep<Handle>
<html> => self.step(InBody, token),

</html> => {
if self.opts.fragment {
if self.is_fragment() {
self.unexpected(&token);
} else {
self.mode = AfterAfterBody;
@@ -1238,7 +1238,7 @@ impl<Handle, Sink> TreeBuilderStep<Handle>
self.unexpected(&token);
} else {
self.pop();
if !self.opts.fragment && !self.current_node_named(atom!(frameset)) {
if !self.is_fragment() && !self.current_node_named(atom!(frameset)) {
self.mode = AfterFrameset;
}
}
@@ -32,7 +32,9 @@ use test::ShouldFail::No;

use html5ever::sink::common::{Document, Doctype, Text, Comment, Element};
use html5ever::sink::rcdom::{RcDom, Handle};
use html5ever::{parse, one_input};
use html5ever::{parse, parse_fragment, one_input};

use string_cache::Atom;

fn parse_tests<It: Iterator<Item=String>>(mut lines: It) -> Vec<HashMap<String, String>> {
let mut tests = vec!();
@@ -145,13 +147,10 @@ fn make_test(
field.as_slice().trim_right_matches('\n').to_string()
};

if fields.get("document-fragment").is_some() {
// FIXME
return;
}

let data = get_field("data");
let expected = get_field("document");
let context = fields.get("document-fragment")
.map(|field| Atom::from_slice(field.as_slice().trim_right_matches('\n')));
let name = format!("tb: {}-{}", path_str, idx);
let ignore = ignores.contains(&name)
|| IGNORE_SUBSTRS.iter().any(|&ig| data.as_slice().contains(ig));
@@ -163,12 +162,27 @@ fn make_test(
should_fail: No,
},
testfn: DynTestFn(Thunk::new(move || {
let dom: RcDom = parse(one_input(data.clone()), Default::default());

let mut result = String::new();
for child in dom.document.borrow().children.iter() {
serialize(&mut result, 1, child.clone());
}
match context {
None => {
let dom: RcDom = parse(one_input(data.clone()), Default::default());
for child in dom.document.borrow().children.iter() {
serialize(&mut result, 1, child.clone());
}
},
Some(context) => {
let dom: RcDom = parse_fragment(one_input(data.clone()),
context,
Default::default());
// fragment case: serialize children of the html element
// rather than children of the document
let doc = dom.document.borrow();
let root = doc.children[0].borrow();
for child in root.children.iter() {
serialize(&mut result, 1, child.clone());
}
},
};
let len = result.len();
result.truncate(len - 1); // drop the trailing newline

ProTip! Use n and p to navigate between commits in a pull request.
You can’t perform that action at this time.