Skip to content

Commit

Permalink
rustdoc-search: tighter encoding for f index
Browse files Browse the repository at this point in the history
Two optimizations for the function signature search:

* Instead of using JSON arrays, like `[1,20]`, it uses VLQ
  hex with no commas, like `[aAd]`.
* This also adds backrefs: if you have more than one function
  with exactly the same signature, it'll not only store it once,
  it'll *decode* it once, and store in the typeIdMap only once.

Size change
-----------

standard library

```console
$ du -bs search-index-old.js search-index-new.js
4976370 search-index-old.js
4404391 search-index-new.js
```

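((4976370-4404391)/4404391)*100% = 13.0% (the old index is 13.0% larger than the new one; measured against the old size, the new index is 11.5% smaller)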

Benchmarks are similarly shrunk:

```console
$ du -bs tmp/{arti,cortex-m,sqlx,stm32f4,ripgrep}/toolchain_{old,new}/doc/search-index.js
10555067        tmp/arti/toolchain_old/doc/search-index.js
8921236 tmp/arti/toolchain_new/doc/search-index.js
77018   tmp/cortex-m/toolchain_old/doc/search-index.js
66676   tmp/cortex-m/toolchain_new/doc/search-index.js
2876330 tmp/sqlx/toolchain_old/doc/search-index.js
2436812 tmp/sqlx/toolchain_new/doc/search-index.js
63632890        tmp/stm32f4/toolchain_old/doc/search-index.js
52337438        tmp/stm32f4/toolchain_new/doc/search-index.js
631150  tmp/ripgrep/toolchain_old/doc/search-index.js
541646  tmp/ripgrep/toolchain_new/doc/search-index.js
```
  • Loading branch information
notriddle committed Dec 31, 2023
1 parent 1ab60f2 commit 86b9550
Show file tree
Hide file tree
Showing 3 changed files with 170 additions and 86 deletions.
146 changes: 97 additions & 49 deletions src/librustdoc/html/render/mod.rs
Expand Up @@ -58,7 +58,7 @@ use rustc_span::{
symbol::{sym, Symbol},
BytePos, FileName, RealFileName,
};
use serde::ser::{SerializeMap, SerializeSeq};
use serde::ser::SerializeMap;
use serde::{Serialize, Serializer};

use crate::clean::{self, ItemId, RenderedLink, SelfTy};
Expand Down Expand Up @@ -123,115 +123,163 @@ pub(crate) struct IndexItem {
}

/// A type used for the search index.
#[derive(Debug)]
#[derive(Debug, Eq, PartialEq)]
pub(crate) struct RenderType {
id: Option<RenderTypeId>,
generics: Option<Vec<RenderType>>,
bindings: Option<Vec<(RenderTypeId, Vec<RenderType>)>>,
}

impl Serialize for RenderType {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
let id = match &self.id {
// 0 is a sentinel, everything else is one-indexed
None => 0,
// concrete type
Some(RenderTypeId::Index(idx)) if *idx >= 0 => idx + 1,
// generic type parameter
Some(RenderTypeId::Index(idx)) => *idx,
_ => panic!("must convert render types to indexes before serializing"),
};
impl RenderType {
pub fn write_to_string(&self, string: &mut String) {
if self.generics.is_some() || self.bindings.is_some() {
let mut seq = serializer.serialize_seq(None)?;
seq.serialize_element(&id)?;
seq.serialize_element(self.generics.as_ref().map(Vec::as_slice).unwrap_or_default())?;
string.push('{');
// 0 is a sentinel, everything else is one-indexed
match self.id {
Some(id) => id.write_to_string(string),
None => string.push('`'),
}
string.push('{');
for generic in &self.generics.as_ref().map(Vec::as_slice).unwrap_or_default()[..] {
generic.write_to_string(string);
}
string.push('}');
if self.bindings.is_some() {
seq.serialize_element(
self.bindings.as_ref().map(Vec::as_slice).unwrap_or_default(),
)?;
string.push('{');
for binding in &self.bindings.as_ref().map(Vec::as_slice).unwrap_or_default()[..] {
string.push('{');
binding.0.write_to_string(string);
string.push('{');
for constraint in &binding.1[..] {
constraint.write_to_string(string);
}
string.push('}');
string.push('}');
}
string.push('}');
}
seq.end()
string.push('}');
} else {
id.serialize(serializer)
// 0 is a sentinel, everything else is one-indexed
match self.id {
Some(id) => id.write_to_string(string),
None => string.push('`'),
}
}
}
}

#[derive(Clone, Copy, Debug)]
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub(crate) enum RenderTypeId {
DefId(DefId),
Primitive(clean::PrimitiveType),
AssociatedType(Symbol),
Index(isize),
}

impl Serialize for RenderTypeId {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
let id = match &self {
impl RenderTypeId {
pub fn write_to_string(&self, string: &mut String) {
// (sign, value)
let (sign, id): (bool, u32) = match &self {
// 0 is a sentinel, everything else is one-indexed
// concrete type
RenderTypeId::Index(idx) if *idx >= 0 => idx + 1,
RenderTypeId::Index(idx) if *idx >= 0 => (false, (idx + 1isize).try_into().unwrap()),
// generic type parameter
RenderTypeId::Index(idx) => *idx,
RenderTypeId::Index(idx) => (true, (-*idx).try_into().unwrap()),
_ => panic!("must convert render types to indexes before serializing"),
};
id.serialize(serializer)
// zig-zag notation
let value: u32 = (id << 1) | (if sign { 1 } else { 0 });
// encode
let mut shift: u32 = 28;
let mut mask: u32 = 0xF0_00_00_00;
while shift < 32 {
let hexit = (value & mask) >> shift;
if hexit != 0 || shift == 0 {
let hex =
char::try_from(if shift == 0 { '`' } else { '@' } as u32 + hexit).unwrap();
string.push(hex);
}
shift = shift.wrapping_sub(4);
mask = mask >> 4;
}
}
}

/// Full type of functions/methods in the search index.
#[derive(Debug)]
#[derive(Debug, Eq, PartialEq)]
pub(crate) struct IndexItemFunctionType {
inputs: Vec<RenderType>,
output: Vec<RenderType>,
where_clause: Vec<Vec<RenderType>>,
}

impl Serialize for IndexItemFunctionType {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
impl IndexItemFunctionType {
pub fn write_to_string<'a>(
&'a self,
string: &mut String,
backref_queue: &mut VecDeque<&'a IndexItemFunctionType>,
) {
assert!(backref_queue.len() < 16);
// If we couldn't figure out a type, just write the `` ` `` sentinel, which decodes to zero.
let has_missing = self
.inputs
.iter()
.chain(self.output.iter())
.any(|i| i.id.is_none() && i.generics.is_none());
if has_missing {
0.serialize(serializer)
string.push('`');
} else if let Some(idx) = backref_queue.iter().position(|other| *other == self) {
string.push(
char::try_from('0' as u32 + u32::try_from(idx).unwrap())
.expect("last possible value is '?'"),
);
} else {
let mut seq = serializer.serialize_seq(None)?;
backref_queue.push_front(self);
if backref_queue.len() >= 16 {
backref_queue.pop_back();
}
string.push('{');
match &self.inputs[..] {
[one] if one.generics.is_none() && one.bindings.is_none() => {
seq.serialize_element(one)?
one.write_to_string(string);
}
_ => {
string.push('{');
for item in &self.inputs[..] {
item.write_to_string(string);
}
string.push('}');
}
_ => seq.serialize_element(&self.inputs)?,
}
match &self.output[..] {
[] if self.where_clause.is_empty() => {}
[one] if one.generics.is_none() && one.bindings.is_none() => {
seq.serialize_element(one)?
one.write_to_string(string);
}
_ => {
string.push('{');
for item in &self.output[..] {
item.write_to_string(string);
}
string.push('}');
}
_ => seq.serialize_element(&self.output)?,
}
for constraint in &self.where_clause {
if let [one] = &constraint[..]
&& one.generics.is_none()
&& one.bindings.is_none()
{
seq.serialize_element(one)?;
one.write_to_string(string);
} else {
seq.serialize_element(constraint)?;
string.push('{');
for item in &constraint[..] {
item.write_to_string(string);
}
string.push('}');
}
}
seq.end()
string.push('}');
}
}
}
Expand Down
29 changes: 7 additions & 22 deletions src/librustdoc/html/render/search_index.rs
@@ -1,5 +1,5 @@
use std::collections::hash_map::Entry;
use std::collections::BTreeMap;
use std::collections::{BTreeMap, VecDeque};

use rustc_data_structures::fx::{FxHashMap, FxIndexMap};
use rustc_middle::ty::TyCtxt;
Expand Down Expand Up @@ -409,9 +409,11 @@ pub(crate) fn build_index<'tcx>(
let mut full_paths = Vec::with_capacity(self.items.len());
let mut descriptions = Vec::with_capacity(self.items.len());
let mut parents = Vec::with_capacity(self.items.len());
let mut functions = Vec::with_capacity(self.items.len());
let mut functions = String::with_capacity(self.items.len());
let mut deprecated = Vec::with_capacity(self.items.len());

let mut backref_queue = VecDeque::new();

for (index, item) in self.items.iter().enumerate() {
let n = item.ty as u8;
let c = char::try_from(n + b'A').expect("item types must fit in ASCII");
Expand All @@ -434,27 +436,10 @@ pub(crate) fn build_index<'tcx>(
full_paths.push((index, &item.path));
}

// Fake option to get `0` out as a sentinel instead of `null`.
// We want to use `0` because it's three less bytes.
enum FunctionOption<'a> {
Function(&'a IndexItemFunctionType),
None,
}
impl<'a> Serialize for FunctionOption<'a> {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
match self {
FunctionOption::None => 0.serialize(serializer),
FunctionOption::Function(ty) => ty.serialize(serializer),
}
}
match &item.search_type {
Some(ty) => ty.write_to_string(&mut functions, &mut backref_queue),
None => functions.push('`'),
}
functions.push(match &item.search_type {
Some(ty) => FunctionOption::Function(ty),
None => FunctionOption::None,
});

if item.deprecation.is_some() {
deprecated.push(index);
Expand Down
81 changes: 66 additions & 15 deletions src/librustdoc/html/static/js/search.js
Expand Up @@ -2767,19 +2767,65 @@ ${item.displayPath}<span class="${type}">${name}</span>\
* The raw function search type format is generated by
* librustdoc/html/render/mod.rs: IndexItemFunctionType::write_to_string
*
* @param {RawFunctionSearchType} functionSearchType
* @param {{
* string: string,
* offset: number,
* backrefQueue: FunctionSearchType[]
* }} itemFunctionDecoder
* @param {Array<{name: string, ty: number}>} lowercasePaths
* @param {Map<string, integer>}
*
* @return {null|FunctionSearchType}
*/
function buildFunctionSearchType(functionSearchType, lowercasePaths) {
const INPUTS_DATA = 0;
const OUTPUT_DATA = 1;
// `0` is used as a sentinel because it's fewer bytes than `null`
if (functionSearchType === 0) {
function buildFunctionSearchType(itemFunctionDecoder, lowercasePaths) {
const c = itemFunctionDecoder.string.charCodeAt(itemFunctionDecoder.offset);
itemFunctionDecoder.offset += 1;
const [zero, ua, la, ob, cb] = ["0", "@", "`", "{", "}"].map(c => c.charCodeAt(0));
// `` ` `` is used as a sentinel because it's fewer bytes than `null`, and decodes to zero
// `0` is a backref
if (c === la) {
return null;
}
// sixteen characters after "0" are backref
if (c >= zero && c < ua) {
return itemFunctionDecoder.backrefQueue[c - zero];
}
if (c !== ob) {
throw ["Unexpected ", c, " in function: expected ", "{", "; this is a bug"];
}
// call after consuming `{`
//
// Decodes the elements of one brace-delimited list from
// `itemFunctionDecoder.string`, starting at `itemFunctionDecoder.offset`.
// Each element is produced by `decode()`, so it is either a number or a
// nested array. On return, the offset points just past the closing `}`.
function decodeList() {
// Peek at the next character without consuming it.
let c = itemFunctionDecoder.string.charCodeAt(itemFunctionDecoder.offset);
const ret = [];
// `cb` is the char code of `}`; keep decoding elements until it is next.
while (c !== cb) {
// decode() advances the offset past the element it reads.
ret.push(decode());
c = itemFunctionDecoder.string.charCodeAt(itemFunctionDecoder.offset);
}
itemFunctionDecoder.offset += 1; // eat cb
return ret;
}
// consumes and returns a list or integer
//
// If the next character is `{`, it is consumed and the nested list is
// returned via decodeList(). Otherwise a hex-encoded integer is read, one
// hexit (4 bits) per character, most significant hexit first.
function decode() {
let n = 0;
let c = itemFunctionDecoder.string.charCodeAt(itemFunctionDecoder.offset);
// `ob` is the char code of `{`: this element is a nested list.
if (c === ob) {
itemFunctionDecoder.offset += 1;
return decodeList();
}
// Characters below `la` (the char code of "`") are non-final hexits; the
// Rust encoder writes them as '@' + hexit. Accumulate their low four bits.
// NOTE(review): `<<` and `>>` operate on 32-bit signed integers in
// JavaScript, so this presumably assumes encoded values stay below 2^31 --
// confirm against the encoder's u32 range.
while (c < la) {
n = (n << 4) | (c & 0xF);
itemFunctionDecoder.offset += 1;
c = itemFunctionDecoder.string.charCodeAt(itemFunctionDecoder.offset);
}
// last character >= la
// (the encoder writes the final hexit as '`' + hexit, which ends the number)
n = (n << 4) | (c & 0xF);
// The lowest bit is the sign flag; the remaining bits are the magnitude.
const [sign, value] = [n & 1, n >> 1];
// Consume the final hexit.
itemFunctionDecoder.offset += 1;
return sign ? -value : value;
}
const functionSearchType = decodeList();
const INPUTS_DATA = 0;
const OUTPUT_DATA = 1;
let inputs, output;
if (typeof functionSearchType[INPUTS_DATA] === "number") {
inputs = [buildItemSearchType(functionSearchType[INPUTS_DATA], lowercasePaths)];
Expand Down Expand Up @@ -2808,9 +2854,14 @@ ${item.displayPath}<span class="${type}">${name}</span>\
? [buildItemSearchType(functionSearchType[i], lowercasePaths)]
: buildItemSearchTypeAll(functionSearchType[i], lowercasePaths));
}
return {
const ret = {
inputs, output, where_clause,
};
itemFunctionDecoder.backrefQueue.unshift(ret);
if (itemFunctionDecoder.backrefQueue.length >= 16) {
itemFunctionDecoder.backrefQueue.pop();
}
return ret;
}

/**
Expand Down Expand Up @@ -2992,8 +3043,12 @@ ${item.displayPath}<span class="${type}">${name}</span>\
const itemDescs = crateCorpus.d;
// an array of (Number) the parent path index + 1 to `paths`, or 0 if none
const itemParentIdxs = crateCorpus.i;
// an array of (Array | 0) the type of the function, if any
const itemFunctionSearchTypes = crateCorpus.f;
// a string representing the list of function types
const itemFunctionDecoder = {
string: crateCorpus.f,
offset: 0,
backrefQueue: [],
};
// an array of (Number) indices for the deprecated items
const deprecatedItems = new Set(crateCorpus.c);
// an array of (Number) indices for the deprecated items
Expand Down Expand Up @@ -3041,12 +3096,8 @@ ${item.displayPath}<span class="${type}">${name}</span>\
word = itemNames[i].toLowerCase();
}
const path = itemPaths.has(i) ? itemPaths.get(i) : lastPath;
let type = null;
if (itemFunctionSearchTypes[i] !== 0) {
type = buildFunctionSearchType(
itemFunctionSearchTypes[i],
lowercasePaths
);
const type = buildFunctionSearchType(itemFunctionDecoder, lowercasePaths);
if (type !== null) {
if (type) {
const fp = functionTypeFingerprint.subarray(id * 4, (id + 1) * 4);
const fps = new Set();
Expand Down

0 comments on commit 86b9550

Please sign in to comment.