Skip to content

Commit

Permalink
Split Symbol interner into static and dynamic
Browse files Browse the repository at this point in the history
This improves performance of Symbol interning in several ways.
The main motivation of this work is to prepare rustc for
efficient parallel builds.

The Symbol lookup table (mapping from strings to Symbol numbers)
is now split into two separate tables: a static table and a
dynamic table. The static table contains strings that are known
to rustc, including all keywords and all `sym::foo` symbols.
The dynamic table contains strings that rustc discovers while
compiling, such as "my_super_obscure_function_name".

Since the static table is known at compile time (that is, when
rustc itself is being compiled), this table can be stored
entirely in static data structures. We use the `phf` crate to
generate this table; `phf` generates perfect hash functions.

This allows rustc to perform Symbol lookups for static symbols
without any multithreaded synchronization, or accessing any
dynamic data whatsoever.

I measured the percentage of static symbol lookups in many
common Rust crates, including rust/compiler, rust/library,
servo, rand, quote, syn, rust-analyzer, rayon, and rsvg.
Among these crates, between 35% and 55% of all symbol lookups
were resolved using the static lookup table.
  • Loading branch information
Arlie Davis committed Jan 20, 2021
1 parent 14265f9 commit 6fa8898
Show file tree
Hide file tree
Showing 8 changed files with 212 additions and 56 deletions.
2 changes: 2 additions & 0 deletions Cargo.lock
Original file line number Diff line number Diff line change
Expand Up @@ -3913,6 +3913,7 @@ dependencies = [
name = "rustc_macros"
version = "0.1.0"
dependencies = [
"phf_codegen",
"proc-macro2",
"quote",
"syn",
Expand Down Expand Up @@ -4200,6 +4201,7 @@ version = "0.0.0"
dependencies = [
"cfg-if 0.1.10",
"md-5",
"phf",
"rustc_arena",
"rustc_data_structures",
"rustc_index",
Expand Down
1 change: 1 addition & 0 deletions compiler/rustc_macros/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,4 @@ synstructure = "0.12.1"
syn = { version = "1", features = ["full"] }
proc-macro2 = "1"
quote = "1"
phf_codegen = "0.8"
92 changes: 57 additions & 35 deletions compiler/rustc_macros/src/symbols.rs
Original file line number Diff line number Diff line change
Expand Up @@ -124,10 +124,6 @@ fn symbols_with_errors(input: TokenStream) -> (TokenStream, Vec<syn::Error>) {
}
};

let mut keyword_stream = quote! {};
let mut symbols_stream = quote! {};
let mut prefill_stream = quote! {};
let mut counter = 0u32;
let mut keys =
HashMap::<String, Span>::with_capacity(input.keywords.len() + input.symbols.len() + 10);
let mut prev_key: Option<(Span, String)> = None;
Expand All @@ -136,8 +132,10 @@ fn symbols_with_errors(input: TokenStream) -> (TokenStream, Vec<syn::Error>) {
if let Some(prev_span) = keys.get(str) {
errors.error(span, format!("Symbol `{}` is duplicated", str));
errors.error(*prev_span, format!("location of previous definition"));
Err(())
} else {
keys.insert(str.to_string(), span);
Ok(())
}
};

Expand All @@ -151,51 +149,75 @@ fn symbols_with_errors(input: TokenStream) -> (TokenStream, Vec<syn::Error>) {
prev_key = Some((span, str.to_string()));
};

let mut symbol_strings: Vec<String> = Vec::new();

// Generate the listed keywords.
let mut keyword_stream = quote! {};
for keyword in input.keywords.iter() {
let name = &keyword.name;
let value = &keyword.value;
let value_string = value.value();
check_dup(keyword.name.span(), &value_string, &mut errors);
prefill_stream.extend(quote! {
#value,
});
let symbol_index = symbol_strings.len() as u32;
if check_dup(keyword.name.span(), &value_string, &mut errors).is_ok() {
// Only add an entry to `symbol_strings` if it is not a duplicate.
// If it is a duplicate, then compilation will fail. However, we still
// want to avoid panicking, if a duplicate is detected.
symbol_strings.push(value_string);
}
keyword_stream.extend(quote! {
pub const #name: Symbol = Symbol::new(#counter);
pub const #name: Symbol = Symbol::new(#symbol_index);
});
counter += 1;
}

// Generate symbols for the strings "0", "1", ..., "9".
let digits_base = symbol_strings.len() as u32;
for n in 0..10 {
let n_string = n.to_string();
if check_dup(Span::call_site(), &n_string, &mut errors).is_ok() {
symbol_strings.push(n_string);
}
}

// Generate the listed symbols.
let mut symbols_stream = quote! {};
for symbol in input.symbols.iter() {
let name = &symbol.name;
let name_string = symbol.name.to_string();
check_order(symbol.name.span(), &name_string, &mut errors);
let value = match &symbol.value {
Some(value) => value.value(),
None => name.to_string(),
None => name_string,
};
check_dup(symbol.name.span(), &value, &mut errors);
check_order(symbol.name.span(), &name.to_string(), &mut errors);

prefill_stream.extend(quote! {
#value,
});
let symbol_index = symbol_strings.len() as u32;
if check_dup(symbol.name.span(), &value, &mut errors).is_ok() {
// Only add an entry to `symbol_strings` if it is not a duplicate.
// If it is a duplicate, then compilation will fail. However, we still
// want to avoid panicking, if a duplicate is detected.
symbol_strings.push(value);
}

symbols_stream.extend(quote! {
pub const #name: Symbol = Symbol::new(#counter);
pub const #name: Symbol = Symbol::new(#symbol_index);
});
counter += 1;
}

// Generate symbols for the strings "0", "1", ..., "9".
let digits_base = counter;
counter += 10;
for n in 0..10 {
let n = n.to_string();
check_dup(Span::call_site(), &n, &mut errors);
prefill_stream.extend(quote! {
#n,
});
// We have finished collecting symbol strings.
let static_symbols_len = symbol_strings.len();
let dynamic_symbol_base = symbol_strings.len() as u32;
let symbol_strings = symbol_strings; // no more mutation

// Build the body of STATIC_SYMBOLS.
let symbol_strings_tokens: TokenStream = symbol_strings.iter().map(|s| quote!(#s,)).collect();

// Build the PHF map. This translates from strings to Symbol values.
let mut phf_map = phf_codegen::Map::<&str>::new();
for (symbol_index, symbol) in symbol_strings.iter().enumerate() {
phf_map.entry(symbol, format!("Symbol::new({})", symbol_index as u32).as_str());
}
let _ = counter; // for future use
let phf_map_built = phf_map.build();
let phf_map_text = phf_map_built.to_string();
let phf_map_expr = syn::parse_str::<syn::Expr>(&phf_map_text).unwrap();

let output = quote! {
const SYMBOL_DIGITS_BASE: u32 = #digits_base;
Expand All @@ -215,13 +237,13 @@ fn symbols_with_errors(input: TokenStream) -> (TokenStream, Vec<syn::Error>) {
#symbols_stream
}

impl Interner {
pub fn fresh() -> Self {
Interner::prefill(&[
#prefill_stream
])
}
}
const DYNAMIC_SYMBOL_BASE: u32 = #dynamic_symbol_base;

static STATIC_SYMBOLS: [&str; #static_symbols_len as usize] = [
#symbol_strings_tokens
];

static STATIC_SYMBOLS_PHF: ::phf::Map<&'static str, Symbol> = #phf_map_expr;
};

(output, errors.list)
Expand Down
1 change: 1 addition & 0 deletions compiler/rustc_span/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,4 @@ tracing = "0.1"
sha-1 = "0.9"
sha2 = "0.9"
md-5 = "0.9"
phf = "0.8"
2 changes: 1 addition & 1 deletion compiler/rustc_span/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ pub struct SessionGlobals {
impl SessionGlobals {
pub fn new(edition: Edition) -> SessionGlobals {
SessionGlobals {
symbol_interner: Lock::new(symbol::Interner::fresh()),
symbol_interner: Lock::new(symbol::Interner::default()),
span_interner: Lock::new(span_encoding::SpanInterner::default()),
hygiene_data: Lock::new(hygiene::HygieneData::new(edition)),
source_map: Lock::new(None),
Expand Down
87 changes: 74 additions & 13 deletions compiler/rustc_span/src/symbol.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1452,17 +1452,55 @@ impl Symbol {
Symbol(SymbolIndex::from_u32(n))
}

/// Maps a string to its interned representation, but only if this string is a known
/// (static) symbol.
pub fn intern_static(string: &str) -> Option<Symbol> {
if let Some(symbol) = STATIC_SYMBOLS_PHF.get(string) { Some(*symbol) } else { None }
}

/// Maps a string to its interned representation.
// #[inline(never)] - There is no benefit to inlining this function (verified with
// performance measurements), and a reduction in overall code size by disabling inlining.
#[inline(never)]
pub fn intern(string: &str) -> Self {
with_interner(|interner| interner.intern(string))
if let Some(symbol) = Symbol::intern_static(string) {
symbol
} else {
with_interner(|interner| interner.intern_dynamic(string))
}
}

pub fn is_static(self) -> bool {
self.0.as_u32() < DYNAMIC_SYMBOL_BASE
}

/// Translates the `Symbol` to a string, but only if this `Symbol`
/// was originally interned as a static symbol.
pub fn as_str_static(self) -> Option<&'static str> {
let symbol_index = self.0.as_usize();
if symbol_index < STATIC_SYMBOLS.len() {
// This is a well-known symbol. The symbol string is stored in a static field.
// There is no need to lock the interner.
Some(STATIC_SYMBOLS[symbol_index])
} else {
None
}
}

/// Convert to a `SymbolStr`. This is a slowish operation because it
/// requires locking the symbol interner.
///
/// If the symbol is a statically-interned symbol (interned at rustc compile time),
/// then this operation is fast, and does not acquire any locks.
pub fn as_str(self) -> SymbolStr {
with_interner(|interner| unsafe {
SymbolStr { string: std::mem::transmute::<&str, &str>(interner.get(self)) }
})
if let Some(string) = self.as_str_static() {
SymbolStr { string }
} else {
// This is a dynamic string. The string is stored in the Interner.
with_interner(|interner| unsafe {
SymbolStr { string: std::mem::transmute::<&str, &str>(interner.get_dynamic(self)) }
})
}
}

pub fn as_u32(self) -> u32 {
Expand Down Expand Up @@ -1528,6 +1566,13 @@ impl<CTX> ToStableHashKey<CTX> for Symbol {
// The `FxHashMap`+`Vec` pair could be replaced by `FxIndexSet`, but #75278
// found that to regress performance up to 2% in some cases. This might be
// revisited after further improvements to `indexmap`.
//
// `Interner` does not contain any of the statically-known symbol names.
// It does not contain any of the strings defined in the `Keyword` or
// `Symbol` sections. Since those strings are statically-known, we just
// look them up in a (static) table, when needed. See
// `STATIC_SYMBOLS` and `STATIC_SYMBOLS_PHF`, which are both generated by
// `compiler/rustc_macros/src/symbols.rs`.
#[derive(Default)]
pub struct Interner {
arena: DroplessArena,
Expand All @@ -1536,21 +1581,28 @@ pub struct Interner {
}

impl Interner {
fn prefill(init: &[&'static str]) -> Self {
Interner {
strings: init.into(),
names: init.iter().copied().zip((0..).map(Symbol::new)).collect(),
..Default::default()
pub fn intern(&mut self, string: &str) -> Symbol {
if let Some(sym) = Symbol::intern_static(string) {
sym
} else {
self.intern_dynamic(string)
}
}

#[inline]
pub fn intern(&mut self, string: &str) -> Symbol {
fn intern_dynamic(&mut self, string: &str) -> Symbol {
// The caller should have already checked for static symbols.
// Failure to do so is a bug, since this code will mistakenly
// intern the static symbol, resulting in a bogus symbol index.
// (The whole point of this design is that you can do static
// lookups without acquiring the thread-local Interner, so if
// we got here with a static symbol, we goofed.)
debug_assert!(Symbol::intern_static(string).is_none());

if let Some(&name) = self.names.get(string) {
return name;
}

let name = Symbol::new(self.strings.len() as u32);
let name = Symbol::new(DYNAMIC_SYMBOL_BASE + self.strings.len() as u32);

// `from_utf8_unchecked` is safe since we just allocated a `&str` which is known to be
// UTF-8.
Expand All @@ -1567,7 +1619,16 @@ impl Interner {
// Get the symbol as a string. `Symbol::as_str()` should be used in
// preference to this function.
pub fn get(&self, symbol: Symbol) -> &str {
self.strings[symbol.0.as_usize()]
if let Some(string) = symbol.as_str_static() {
string
} else {
&self.strings[(symbol.as_u32() - DYNAMIC_SYMBOL_BASE) as usize]
}
}

fn get_dynamic(&self, symbol: Symbol) -> &str {
debug_assert!(!symbol.is_static());
self.strings[(symbol.as_u32() - DYNAMIC_SYMBOL_BASE) as usize]
}
}

Expand Down
78 changes: 71 additions & 7 deletions compiler/rustc_span/src/symbol/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,14 @@ use crate::{edition, SessionGlobals};
#[test]
fn interner_tests() {
let mut i: Interner = Interner::default();
// first one is zero:
assert_eq!(i.intern("dog"), Symbol::new(0));
let dog = i.intern("dog");
// re-use gets the same entry:
assert_eq!(i.intern("dog"), Symbol::new(0));
assert_eq!(i.intern("dog").as_u32(), dog.as_u32());
// different string gets a different #:
assert_eq!(i.intern("cat"), Symbol::new(1));
assert_eq!(i.intern("cat"), Symbol::new(1));
// dog is still at zero
assert_eq!(i.intern("dog"), Symbol::new(0));
let cat = i.intern("cat");
assert_ne!(dog.as_u32(), cat.as_u32());
assert_eq!(i.intern("cat").as_u32(), cat.as_u32());
assert_eq!(i.intern("dog").as_u32(), dog.as_u32());
}

#[test]
Expand All @@ -23,3 +22,68 @@ fn without_first_quote_test() {
assert_eq!(i.without_first_quote().name, kw::Break);
});
}

#[test]
fn test_static_symbols() {
assert_eq!(Symbol::intern_static(""), Some(kw::Invalid));
assert_eq!(Symbol::intern_static("not in the static table"), None);
assert!(Symbol::intern_static("fn").is_some()); // don't care about exact index

// check round-tripping
for &string in ["as", "fn", "let", "trait", "size_of_val"].iter() {
let sym = Symbol::intern_static(string).unwrap();
assert_eq!(string, &*sym.as_str(), "sym #{}", sym.0.as_u32());
}
}

#[test]
fn test_ident_is_special() {
for &s in [kw::Invalid, kw::PathRoot, kw::DollarCrate, kw::Underscore].iter() {
let ident = Ident::with_dummy_span(s);
assert_eq!(ident.is_special(), true, "s = {:?}", s);
}

for &s in [kw::As, kw::Break, kw::UnderscoreLifetime].iter() {
let ident = Ident::with_dummy_span(s);
assert_eq!(ident.is_special(), false, "s = {:?}", s);
}
}

#[test]
fn test_symbol_as_str() {
SESSION_GLOBALS.set(&SessionGlobals::new(edition::Edition::Edition2018), || {
for &(sym, string) in [
(kw::Invalid, ""),
(kw::PathRoot, "{{root}}"),
(kw::DollarCrate, "$crate"),
(kw::As, "as"),
(kw::Break, "break"),
(kw::While, "while"),
(kw::Union, "union"),
(sym::Alignment, "Alignment"),
(sym::Arc, "Arc"),
(sym::zmm_reg, "zmm_reg"),
(sym::i64, "i64"),
]
.iter()
{
let as_str = sym.as_str();
assert_eq!(&*as_str, string);

let sym2 = Symbol::intern(string);
assert_eq!(sym, sym2, "sym={} sym2={}", sym.as_u32(), sym2.as_u32());
}

let colon = Symbol::intern(":");
assert_eq!(&*colon.as_str(), ":");
});
}

#[test]
fn test_dynamic_symbols() {
crate::with_session_globals(crate::edition::Edition::Edition2018, || {
let s1 = Symbol::intern("fuzzy wuzzy");
assert!(!s1.is_static());
assert_eq!(&*s1.as_str(), "fuzzy wuzzy");
});
}
Loading

0 comments on commit 6fa8898

Please sign in to comment.