From 80121b971e0dfe96ccee7fe0337fdd870da4525d Mon Sep 17 00:00:00 2001 From: Or Gany Date: Mon, 21 Aug 2023 08:32:11 +0300 Subject: [PATCH] Adding comments for arena --- html5ever/examples/arena.rs | 35 ++++++++++++++++++------ html5ever/examples/noop-tokenize.rs | 9 ++++-- html5ever/examples/noop-tree-builder.rs | 8 ++++++ html5ever/examples/print-tree-actions.rs | 3 ++ html5ever/examples/tokenize.rs | 7 +++++ 5 files changed, 51 insertions(+), 11 deletions(-) diff --git a/html5ever/examples/arena.rs b/html5ever/examples/arena.rs index d084e011..acc05705 100644 --- a/html5ever/examples/arena.rs +++ b/html5ever/examples/arena.rs @@ -19,36 +19,32 @@ use std::collections::HashSet; use std::io::{self, Read}; use std::ptr; -fn main() { - let mut bytes = Vec::new(); - io::stdin().read_to_end(&mut bytes).unwrap(); - let arena = typed_arena::Arena::new(); - html5ever_parse_slice_into_arena(&bytes, &arena); -} - +/// By using our Sink type, the arena is filled with parsed HTML. fn html5ever_parse_slice_into_arena<'a>(bytes: &[u8], arena: Arena<'a>) -> Ref<'a> { let sink = Sink { arena, document: arena.alloc(Node::new(NodeData::Document)), quirks_mode: QuirksMode::NoQuirks, }; + parse_document(sink, Default::default()) .from_utf8() .one(bytes) } type Arena<'arena> = &'arena typed_arena::Arena>; - type Ref<'arena> = &'arena Node<'arena>; - type Link<'arena> = Cell>>; +/// Sink struct is responsible for handling how the data that comes out of the HTML parsing +/// unit (TreeBuilder in our case) is handled. struct Sink<'arena> { arena: Arena<'arena>, document: Ref<'arena>, quirks_mode: QuirksMode, } +/// DOM node which contains links to other nodes in the tree. pub struct Node<'arena> { parent: Link<'arena>, next_sibling: Link<'arena>, @@ -58,6 +54,7 @@ pub struct Node<'arena> { data: NodeData<'arena>, } +/// HTML node data which can be an element, a comment, a string, a DOCTYPE, etc... 
pub enum NodeData<'arena> { Document, Doctype { @@ -178,6 +175,11 @@ impl<'arena> Sink<'arena> { } } +/// By implementing the TreeSink trait we determine how the data from the tree building step +/// is processed. In our case, our data is allocated in the arena and added to the Node data +/// structure. +/// +/// For a deeper understanding of each function go to the TreeSink declaration. impl<'arena> TreeSink for Sink<'arena> { type Handle = Ref<'arena>; type Output = Ref<'arena>; @@ -333,3 +335,18 @@ impl<'arena> TreeSink for Sink<'arena> { } } } + +/// In this example an "arena" is created and filled with the DOM nodes. +/// "Arena" is a type of allocation in which a block of memory is allocated +/// and later filled with data, DOM nodes in this case. When the arena is deallocated +/// it is destroyed with all of its items. +/// +/// Further info about arena: https://docs.rs/typed-arena/latest/typed_arena/ +fn main() { + // Read HTML from the standard input + let mut bytes = Vec::new(); + io::stdin().read_to_end(&mut bytes).unwrap(); + + let arena = typed_arena::Arena::new(); + html5ever_parse_slice_into_arena(&bytes, &arena); +} diff --git a/html5ever/examples/noop-tokenize.rs b/html5ever/examples/noop-tokenize.rs index 68b1c8c9..0c733e46 100644 --- a/html5ever/examples/noop-tokenize.rs +++ b/html5ever/examples/noop-tokenize.rs @@ -16,22 +16,27 @@ use std::io; use html5ever::tendril::*; use html5ever::tokenizer::{BufferQueue, Token, TokenSink, TokenSinkResult, Tokenizer}; +/// In our case, our sink only contains a tokens vector struct Sink(Vec); impl TokenSink for Sink { type Handle = (); + /// Each processed token will be handled by this method fn process_token(&mut self, token: Token, _line_number: u64) -> TokenSinkResult<()> { - // Don't use the token, but make sure we don't get - // optimized out entirely.
self.0.push(token); TokenSinkResult::Continue } } +/// In this example we implement the TokenSink trait which lets us implement how each +/// parsed token is treated. In our example we take each token and insert it into a vector. fn main() { + // Read HTML from standard input let mut chunk = ByteTendril::new(); io::stdin().read_to_tendril(&mut chunk).unwrap(); + + // Create a buffer queue for the tokenizer let mut input = BufferQueue::default(); input.push_back(chunk.try_reinterpret().unwrap()); diff --git a/html5ever/examples/noop-tree-builder.rs b/html5ever/examples/noop-tree-builder.rs index 5e516df6..1baebf99 100644 --- a/html5ever/examples/noop-tree-builder.rs +++ b/html5ever/examples/noop-tree-builder.rs @@ -32,6 +32,10 @@ impl Sink { } } +/// By implementing the TreeSink trait we determine how the data from the tree building step +/// is processed. In this case the DOM elements are written into the "names" hashmap. +/// +/// For a deeper understanding of each function go to the TreeSink declaration. impl TreeSink for Sink { type Handle = usize; type Output = Self; @@ -98,11 +102,15 @@ impl TreeSink for Sink { fn mark_script_already_started(&mut self, _node: &usize) {} } +/// In this example we implement the TreeSink trait which takes each parsed element and inserts +/// it into a hashmap, while each element is given a numeric id.
fn main() { let sink = Sink { next_id: 1, names: HashMap::new(), }; + + // Read HTML from the standard input and parse it let stdin = io::stdin(); parse_document(sink, Default::default()) .from_utf8() diff --git a/html5ever/examples/print-tree-actions.rs b/html5ever/examples/print-tree-actions.rs index b95368df..2fcf0ad8 100644 --- a/html5ever/examples/print-tree-actions.rs +++ b/html5ever/examples/print-tree-actions.rs @@ -158,6 +158,9 @@ impl TreeSink for Sink { } } +/// Same example as the "noop-tree-builder", but this time every function implemented in our +/// Sink object prints a log, so it's easier to get an understanding of when each function is +/// called. fn main() { let sink = Sink { next_id: 1, diff --git a/html5ever/examples/tokenize.rs b/html5ever/examples/tokenize.rs index 8d4d9e7d..3b14175f 100644 --- a/html5ever/examples/tokenize.rs +++ b/html5ever/examples/tokenize.rs @@ -81,10 +81,16 @@ impl TokenSink for TokenPrinter { } } +/// In this example we implement the TokenSink trait in such a way that each token is printed. +/// If there's an error while processing a token it is printed as well. fn main() { let mut sink = TokenPrinter { in_char_run: false }; + + // Read HTML from standard input let mut chunk = ByteTendril::new(); io::stdin().read_to_tendril(&mut chunk).unwrap(); + + // Create a buffer queue for the tokenizer let mut input = BufferQueue::default(); input.push_back(chunk.try_reinterpret().unwrap()); @@ -96,6 +102,7 @@ fn main() { let _ = tok.feed(&mut input); + assert!(input.is_empty()); tok.end(); sink.is_char(false);