From 7204168fee6ef766d8f0be241d6b4b2ea11ef518 Mon Sep 17 00:00:00 2001 From: Simon Vandel Sillesen Date: Sun, 17 May 2020 11:39:11 +0200 Subject: [PATCH 1/9] Do not allocate GreenToken before we have checked if we have it in cache --- src/green/builder.rs | 19 ++++++++++++------- src/green/token.rs | 8 +++++++- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/src/green/builder.rs b/src/green/builder.rs index 35ecbc3..31f63c5 100644 --- a/src/green/builder.rs +++ b/src/green/builder.rs @@ -1,5 +1,6 @@ -use rustc_hash::FxHashSet; +use rustc_hash::{FxHashMap, FxHashSet}; +use super::token::GreenTokenData; use crate::{ green::{GreenElement, GreenNode, GreenToken, SyntaxKind}, NodeOrToken, SmolStr, @@ -8,7 +9,7 @@ use crate::{ #[derive(Default, Debug)] pub struct NodeCache { nodes: FxHashSet, - tokens: FxHashSet, + tokens: FxHashMap, } impl NodeCache { @@ -36,12 +37,16 @@ impl NodeCache { } fn token(&mut self, kind: SyntaxKind, text: SmolStr) -> GreenToken { - let mut token = GreenToken::new(kind, text); - match self.tokens.get(&token) { - Some(existing) => token = existing.clone(), - None => assert!(self.tokens.insert(token.clone())), + let token_data = GreenTokenData::new(kind, text.clone()); + + match self.tokens.get(&token_data) { + Some(existing) => existing.clone(), + None => { + let token = GreenToken::new(kind, text.clone()); + self.tokens.insert(token_data, token.clone()); + token + } } - token } } diff --git a/src/green/token.rs b/src/green/token.rs index e04944e..87f7fab 100644 --- a/src/green/token.rs +++ b/src/green/token.rs @@ -4,11 +4,17 @@ use crate::{green::SyntaxKind, SmolStr, TextSize}; #[repr(align(2))] // NB: this is an at-least annotation #[derive(Debug, PartialEq, Eq, Hash)] -struct GreenTokenData { +pub(crate) struct GreenTokenData { kind: SyntaxKind, text: SmolStr, } +impl GreenTokenData { + pub fn new(kind: SyntaxKind, text: SmolStr) -> GreenTokenData { + GreenTokenData { kind, text } + } +} + /// Leaf node in the immutable tree. pub struct GreenToken { ptr: ptr::NonNull, From 53b46016db3df17192aba93200eabd2311632410 Mon Sep 17 00:00:00 2001 From: Simon Vandel Sillesen Date: Sun, 17 May 2020 13:59:58 +0200 Subject: [PATCH 2/9] Only allocate GreenNode if the node is not in the cache --- src/green/builder.rs | 72 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 62 insertions(+), 10 deletions(-) diff --git a/src/green/builder.rs b/src/green/builder.rs index 31f63c5..ccb706f 100644 --- a/src/green/builder.rs +++ b/src/green/builder.rs @@ -1,24 +1,66 @@ -use rustc_hash::{FxHashMap, FxHashSet}; +use rustc_hash::{FxHashMap, FxHasher}; use super::token::GreenTokenData; use crate::{ green::{GreenElement, GreenNode, GreenToken, SyntaxKind}, NodeOrToken, SmolStr, }; +use std::{ + fmt::Debug, + hash::{Hash, Hasher}, +}; #[derive(Default, Debug)] pub struct NodeCache { - nodes: FxHashSet, + nodes: FxHashMap, tokens: FxHashMap, } +struct GreenNodeHash { + hasher: FxHasher, + inner_hash: u64, +} + +impl Eq for GreenNodeHash {} +impl PartialEq for GreenNodeHash { + fn eq(&self, other: &Self) -> bool { + self.inner_hash == other.inner_hash + } +} + +impl Debug for GreenNodeHash { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.inner_hash.fmt(f) + } +} + +impl GreenNodeHash { + fn new<'a>(kind: SyntaxKind) -> Self { + let mut hasher = FxHasher::default(); + kind.hash(&mut hasher); + let inner_hash = hasher.finish(); + Self { hasher, inner_hash } + } + + fn add_child(&mut self, child: &GreenElement) { + child.hash(&mut self.hasher); + self.inner_hash = self.hasher.finish(); + } +} + +impl Hash for GreenNodeHash { + fn hash(&self, state: &mut H) { + self.inner_hash.hash(state); + } +} + impl NodeCache { fn node(&mut self, kind: SyntaxKind, children: I) -> GreenNode where - I: IntoIterator, - I::IntoIter: ExactSizeIterator, + I: ExactSizeIterator, { - let mut node = GreenNode::new(kind, children); + let num_children = children.len(); + // Green nodes are fully immutable, so it's ok to deduplicate them. // This is the same optimization that Roslyn does // https://github.com/KirillOsenkov/Bliki/wiki/Roslyn-Immutable-Trees @@ -27,13 +69,23 @@ impl NodeCache { // For `libsyntax/parse/parser.rs`, measurements show that deduping saves // 17% of the memory for green nodes! // Future work: make hashing faster by avoiding rehashing of subtrees. - if node.children().len() <= 3 { - match self.nodes.get(&node) { - Some(existing) => node = existing.clone(), - None => assert!(self.nodes.insert(node.clone())), + if num_children <= 3 { + let mut hash = GreenNodeHash::new(kind); + let x: Vec = children.collect(); + for child in x.iter() { + hash.add_child(child); + } + match self.nodes.get(&hash) { + Some(existing) => existing.clone(), + None => { + let node = GreenNode::new(kind, x.into_iter()); + self.nodes.insert(hash, node.clone()); + node + } } + } else { + GreenNode::new(kind, children) } - node } fn token(&mut self, kind: SyntaxKind, text: SmolStr) -> GreenToken { From bda5216fbb29b580e2a6a6e1b04086d22ad48bd3 Mon Sep 17 00:00:00 2001 From: Simon Vandel Sillesen Date: Sun, 17 May 2020 14:44:06 +0200 Subject: [PATCH 3/9] Use smallVec to avoid Vec allocation --- Cargo.toml | 1 + src/green/builder.rs | 15 ++++++++++----- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index dcc72a1..d442482 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,6 +10,7 @@ edition = "2018" [dependencies] rustc-hash = "1.0.1" text-size = "1.0.0" +smallvec = "1.4.0" smol_str = "0.1.10" serde = { version = "1.0.89", optional = true, default-features = false } thin-dst = "1.0.0" diff --git a/src/green/builder.rs b/src/green/builder.rs index ccb706f..7dbe0f0 100644 --- a/src/green/builder.rs +++ b/src/green/builder.rs @@ -5,6 +5,7 @@ use crate::{ green::{GreenElement, GreenNode, GreenToken, SyntaxKind}, NodeOrToken, SmolStr, }; +use smallvec::SmallVec; use std::{ fmt::Debug, hash::{Hash, Hasher}, @@ -60,7 +61,7 @@ impl NodeCache { I: ExactSizeIterator, { let num_children = children.len(); - + const MAX_CHILDREN: usize = 3; // Green nodes are fully immutable, so it's ok to deduplicate them. // This is the same optimization that Roslyn does // https://github.com/KirillOsenkov/Bliki/wiki/Roslyn-Immutable-Trees @@ -69,16 +70,20 @@ impl NodeCache { // For `libsyntax/parse/parser.rs`, measurements show that deduping saves // 17% of the memory for green nodes! // Future work: make hashing faster by avoiding rehashing of subtrees. - if num_children <= 3 { + if num_children <= MAX_CHILDREN { let mut hash = GreenNodeHash::new(kind); - let x: Vec = children.collect(); - for child in x.iter() { + let mut collected_children = + SmallVec::<[GreenElement; MAX_CHILDREN]>::with_capacity(MAX_CHILDREN); + for child in children { + collected_children.push(child); + } + for child in collected_children.iter() { hash.add_child(child); } match self.nodes.get(&hash) { Some(existing) => existing.clone(), None => { - let node = GreenNode::new(kind, x.into_iter()); + let node = GreenNode::new(kind, collected_children.into_iter()); self.nodes.insert(hash, node.clone()); node } From 29d2f58913ef282fd39f4f9460588b9fbd3e3f89 Mon Sep 17 00:00:00 2001 From: Simon Vandel Sillesen Date: Sun, 17 May 2020 16:25:30 +0200 Subject: [PATCH 4/9] visibility --- src/green/token.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/green/token.rs b/src/green/token.rs index 87f7fab..5dc52bc 100644 --- a/src/green/token.rs +++ b/src/green/token.rs @@ -10,7 +10,7 @@ pub(crate) struct GreenTokenData { } impl GreenTokenData { - pub fn new(kind: SyntaxKind, text: SmolStr) -> GreenTokenData { + pub(crate) fn new(kind: SyntaxKind, text: SmolStr) -> GreenTokenData { GreenTokenData { kind, text } } } From d5434ace97e956d29dabe2c3d62dbd18cb6f9943 Mon Sep 17 00:00:00 2001 From: Simon Vandel Sillesen Date: Tue, 19 May 2020 20:53:44 +0200 Subject: [PATCH 5/9] collect instead of push loop --- src/green/builder.rs | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/green/builder.rs b/src/green/builder.rs index 7dbe0f0..496571c 100644 --- a/src/green/builder.rs +++ b/src/green/builder.rs @@ -72,11 +72,7 @@ impl NodeCache { // Future work: make hashing faster by avoiding rehashing of subtrees. if num_children <= MAX_CHILDREN { let mut hash = GreenNodeHash::new(kind); - let mut collected_children = - SmallVec::<[GreenElement; MAX_CHILDREN]>::with_capacity(MAX_CHILDREN); - for child in children { - collected_children.push(child); - } + let collected_children: SmallVec<[GreenElement; MAX_CHILDREN]> = children.collect(); for child in collected_children.iter() { hash.add_child(child); } From 35f39db570010c0598da7cc75099e880d84f4970 Mon Sep 17 00:00:00 2001 From: Simon Vandel Sillesen Date: Tue, 19 May 2020 21:37:19 +0200 Subject: [PATCH 6/9] avoid api change --- src/green/builder.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/green/builder.rs b/src/green/builder.rs index 496571c..f91387e 100644 --- a/src/green/builder.rs +++ b/src/green/builder.rs @@ -58,8 +58,10 @@ impl Hash for GreenNodeHash { impl NodeCache { fn node(&mut self, kind: SyntaxKind, children: I) -> GreenNode where - I: ExactSizeIterator, + I: IntoIterator, + I::IntoIter: ExactSizeIterator, { + let children = children.into_iter(); let num_children = children.len(); const MAX_CHILDREN: usize = 3; // Green nodes are fully immutable, so it's ok to deduplicate them. From 83deb5e402175c89115a4d1ceb592f69c1b670a6 Mon Sep 17 00:00:00 2001 From: Simon Vandel Sillesen Date: Tue, 19 May 2020 21:37:59 +0200 Subject: [PATCH 7/9] simplify hashing --- src/green/builder.rs | 43 ++++++++++++------------------------------- 1 file changed, 12 insertions(+), 31 deletions(-) diff --git a/src/green/builder.rs b/src/green/builder.rs index f91387e..e836e87 100644 --- a/src/green/builder.rs +++ b/src/green/builder.rs @@ -17,41 +17,24 @@ pub struct NodeCache { tokens: FxHashMap, } +#[derive(Debug, Hash, Eq, PartialEq)] struct GreenNodeHash { - hasher: FxHasher, inner_hash: u64, } -impl Eq for GreenNodeHash {} -impl PartialEq for GreenNodeHash { - fn eq(&self, other: &Self) -> bool { - self.inner_hash == other.inner_hash - } -} - -impl Debug for GreenNodeHash { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - self.inner_hash.fmt(f) - } -} - impl GreenNodeHash { - fn new<'a>(kind: SyntaxKind) -> Self { + fn new<'a, I>(kind: SyntaxKind, children: I) -> Self + where + I: Iterator, + { let mut hasher = FxHasher::default(); kind.hash(&mut hasher); - let inner_hash = hasher.finish(); - Self { hasher, inner_hash } - } - - fn add_child(&mut self, child: &GreenElement) { - child.hash(&mut self.hasher); - self.inner_hash = self.hasher.finish(); - } -} -impl Hash for GreenNodeHash { - fn hash(&self, state: &mut H) { - self.inner_hash.hash(state); + for child in children { + child.hash(&mut hasher); + } + let inner_hash = hasher.finish(); + Self { inner_hash } } } @@ -73,11 +56,9 @@ impl NodeCache { // 17% of the memory for green nodes! // Future work: make hashing faster by avoiding rehashing of subtrees. if num_children <= MAX_CHILDREN { - let mut hash = GreenNodeHash::new(kind); let collected_children: SmallVec<[GreenElement; MAX_CHILDREN]> = children.collect(); - for child in collected_children.iter() { - hash.add_child(child); - } + let hash = GreenNodeHash::new(kind, collected_children.iter()); + match self.nodes.get(&hash) { Some(existing) => existing.clone(), None => { From 6b359752b5d012e488dbbfdea761d4a84108df52 Mon Sep 17 00:00:00 2001 From: Simon Vandel Sillesen Date: Tue, 19 May 2020 21:58:41 +0200 Subject: [PATCH 8/9] Avoid hashing complete subtrees, only hash node to potentially cache --- src/green/builder.rs | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/green/builder.rs b/src/green/builder.rs index e836e87..be79357 100644 --- a/src/green/builder.rs +++ b/src/green/builder.rs @@ -23,6 +23,7 @@ struct GreenNodeHash { } impl GreenNodeHash { + /// Constructs a hash of a node by hashing the kind, and the pointers to all the children. fn new<'a, I>(kind: SyntaxKind, children: I) -> Self where I: Iterator, @@ -31,7 +32,15 @@ impl GreenNodeHash { kind.hash(&mut hasher); for child in children { - child.hash(&mut hasher); + match child { + NodeOrToken::Node(node) => { + node.kind().hash(&mut hasher); + node.ptr().hash(&mut hasher); + } + NodeOrToken::Token(token) => { + token.hash(&mut hasher); + } + } } let inner_hash = hasher.finish(); Self { inner_hash } @@ -54,7 +63,6 @@ impl NodeCache { // For example, all `#[inline]` in this file share the same green node! // For `libsyntax/parse/parser.rs`, measurements show that deduping saves // 17% of the memory for green nodes! - // Future work: make hashing faster by avoiding rehashing of subtrees. if num_children <= MAX_CHILDREN { let collected_children: SmallVec<[GreenElement; MAX_CHILDREN]> = children.collect(); let hash = GreenNodeHash::new(kind, collected_children.iter()); From 51e0ba3e849cef5b21e7c6ffc5531ec55a172f3b Mon Sep 17 00:00:00 2001 From: Simon Vandel Sillesen Date: Wed, 20 May 2020 08:37:50 +0200 Subject: [PATCH 9/9] Fix issue with hash collitions --- src/green/builder.rs | 54 +++++++++++++++++++++++++++++--------------- 1 file changed, 36 insertions(+), 18 deletions(-) diff --git a/src/green/builder.rs b/src/green/builder.rs index be79357..9f06c70 100644 --- a/src/green/builder.rs +++ b/src/green/builder.rs @@ -1,4 +1,4 @@ -use rustc_hash::{FxHashMap, FxHasher}; +use rustc_hash::FxHashMap; use super::token::GreenTokenData; use crate::{ @@ -9,6 +9,7 @@ use smallvec::SmallVec; use std::{ fmt::Debug, hash::{Hash, Hasher}, + ptr, }; #[derive(Default, Debug)] @@ -17,36 +18,53 @@ pub struct NodeCache { tokens: FxHashMap, } -#[derive(Debug, Hash, Eq, PartialEq)] +#[derive(Debug)] struct GreenNodeHash { - inner_hash: u64, + kind: SyntaxKind, + children: ChildrenVec, } impl GreenNodeHash { /// Constructs a hash of a node by hashing the kind, and the pointers to all the children. - fn new<'a, I>(kind: SyntaxKind, children: I) -> Self - where - I: Iterator, - { - let mut hasher = FxHasher::default(); - kind.hash(&mut hasher); + fn new(kind: SyntaxKind, children: ChildrenVec) -> Self { + Self { kind, children } + } +} + +impl Hash for GreenNodeHash { + fn hash(&self, state: &mut H) { + self.kind.hash(state); - for child in children { + for child in self.children.iter() { match child { NodeOrToken::Node(node) => { - node.kind().hash(&mut hasher); - node.ptr().hash(&mut hasher); + node.kind().hash(state); + node.ptr().hash(state); } NodeOrToken::Token(token) => { - token.hash(&mut hasher); + token.hash(state); } } } - let inner_hash = hasher.finish(); - Self { inner_hash } } } +impl Eq for GreenNodeHash {} +impl PartialEq for GreenNodeHash { + fn eq(&self, other: &Self) -> bool { + self.kind == other.kind + && self.children.len() == other.children.len() + && self.children.iter().zip(other.children.iter()).all(|pair| match pair { + (NodeOrToken::Node(lhs), NodeOrToken::Node(rhs)) => ptr::eq(lhs.ptr(), rhs.ptr()), + (NodeOrToken::Token(lhs), NodeOrToken::Token(rhs)) => *lhs == *rhs, + _ => false, + }) + } +} + +const MAX_CHILDREN: usize = 3; +type ChildrenVec = SmallVec<[GreenElement; MAX_CHILDREN]>; + impl NodeCache { fn node(&mut self, kind: SyntaxKind, children: I) -> GreenNode where @@ -55,7 +73,7 @@ impl NodeCache { { let children = children.into_iter(); let num_children = children.len(); - const MAX_CHILDREN: usize = 3; + // Green nodes are fully immutable, so it's ok to deduplicate them. // This is the same optimization that Roslyn does // https://github.com/KirillOsenkov/Bliki/wiki/Roslyn-Immutable-Trees @@ -64,8 +82,8 @@ impl NodeCache { // For `libsyntax/parse/parser.rs`, measurements show that deduping saves // 17% of the memory for green nodes! if num_children <= MAX_CHILDREN { - let collected_children: SmallVec<[GreenElement; MAX_CHILDREN]> = children.collect(); - let hash = GreenNodeHash::new(kind, collected_children.iter()); + let collected_children: ChildrenVec = children.collect(); + let hash = GreenNodeHash::new(kind, collected_children.clone()); match self.nodes.get(&hash) { Some(existing) => existing.clone(),