Skip to content

Commit

Permalink
Optimize string merging
Browse files Browse the repository at this point in the history
Linking clang-13 with debug info takes ~3.6 seconds on a simulated
10-core/20-threads machine. mold spends most of its time (~2.3 seconds)
merging string literals in .debug_str. Input .debug_str sections contain
70 million string literals in total, which is reduced to 2 million after
de-duplication. The input object files contain a lot of duplicates.
clang-13 with debug info is enormous -- it is ~3.1 GiB after linking.

It looks like TBB's concurrent hashmap doesn't scale well with the
input.

In this patch, I implemented our own concurrent hashmap. The hashmap
is extremely lightweight and supports only the key-value insertion
operation. It doesn't even support rehashing. It aborts once the hash
table becomes full.

In order to know the correct size for the hashmap before inserting
strings into it, I also implemented the HyperLogLog algorithm in this
patch. HyperLogLog is an algorithm that gives a fairly accurate estimate
of the number of unique elements.

With this patch, mold can link clang-13 in ~2.5 seconds, which is ~30%
faster than before.

#73
  • Loading branch information
rui314 committed Jul 13, 2021
1 parent 6bb0915 commit 41b2fa7
Show file tree
Hide file tree
Showing 8 changed files with 288 additions and 111 deletions.
3 changes: 2 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,12 @@ CPPFLAGS = -g -Imimalloc/include -pthread -std=c++20 \
-DGIT_HASH=\"$(GIT_HASH)\" \
$(EXTRA_CPPFLAGS)
LDFLAGS += $(EXTRA_LDFLAGS) -rdynamic
LIBS = -Wl,-as-needed -lcrypto -pthread -lz -lxxhash -ldl
LIBS = -Wl,-as-needed -lcrypto -pthread -lz -lxxhash -ldl -lm
OBJS = main.o object_file.o input_sections.o output_chunks.o \
mapfile.o perf.o linker_script.o archive_file.o output_file.o \
subprocess.o gc_sections.o icf.o symbols.o cmdline.o filepath.o \
passes.o tar.o compress.o memory_mapped_file.o relocatable.o \
concurrent_map.o hyperloglog.o \
arch_x86_64.o arch_i386.o arch_aarch64.o

PREFIX ?= /usr
Expand Down
80 changes: 80 additions & 0 deletions concurrent_map.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
#include "mold.h"

// Sentinel key value. insert() stores it in a bucket while that bucket is
// being filled in, and other threads spin while they observe it.
static const char *locked = reinterpret_cast<const char *>(-1);

// Smallest table size resize() will allocate. insert() also uses it as
// the maximum number of buckets it probes before giving up.
static constexpr i64 MIN_NBUCKETS = 256;

// Creates a map without any bucket storage. insert() returns a null
// result on such a map until resize() has been called.
template <typename T>
ConcurrentMap<T>::ConcurrentMap() = default;

// Creates a map and immediately allocates its bucket storage by
// delegating to resize(nbuckets).
template <typename T>
ConcurrentMap<T>::ConcurrentMap(i64 nbuckets) {
  this->resize(nbuckets);
}

// Discards the current contents and allocates fresh, zero-initialized
// bucket arrays.
//
// The bucket count is next_power_of_two(nbuckets), but never less than
// MIN_NBUCKETS, so insert() can turn a hash into an index with a bit mask.
//
// No synchronization is performed here, so this must not run
// concurrently with insert().
template <typename T>
void ConcurrentMap<T>::resize(i64 nbuckets) {
  // Release the previous arrays directly. Invoking the destructor on
  // `this` would end the object's lifetime, which makes both the member
  // accesses below and the eventual real destructor call undefined
  // behavior. free(nullptr) is a no-op, so a map that has never been
  // sized is handled too.
  free((void *)keys);
  free((void *)sizes);
  free((void *)values);

  nbuckets = std::max<i64>(MIN_NBUCKETS, next_power_of_two(nbuckets));

  this->nbuckets = nbuckets;
  keys = (std::atomic<const char *> *)calloc(nbuckets, sizeof(keys[0]));
  sizes = (u32 *)calloc(nbuckets, sizeof(sizes[0]));
  values = (T *)calloc(nbuckets, sizeof(values[0]));
}

// Releases the three bucket arrays. resize() allocates them together, so
// a null `keys` means there is nothing to free. Only the raw storage is
// released; destructors of stored T objects are not run.
template <typename T>
ConcurrentMap<T>::~ConcurrentMap() {
  if (!keys)
    return;

  free((void *)keys);
  free((void *)sizes);
  free((void *)values);
}

// Inserts `key` with value `val` unless an equal key is already present.
//
// `hash` is the caller-supplied hash of `key`. Its low bits select the
// starting bucket, and collisions are resolved by linear probing over at
// most MIN_NBUCKETS buckets.
//
// Returns {pointer to the stored value, true} if this call inserted the
// key, or {pointer to the existing value, false} if an equal key was
// already in the table. Returns {nullptr, false} if the map has no
// storage yet.
//
// Only key.data() is stored, not a copy of the bytes, so the memory
// behind `key` has to stay valid for as long as the map is used.
//
// NOTE(review): a key whose data() is nullptr would write nullptr back
// into the bucket and make it look empty again -- confirm that callers
// never pass such a key.
template <typename T>
std::pair<T *, bool>
ConcurrentMap<T>::insert(std::string_view key, u64 hash, const T &val) {
  if (!keys)
    return {nullptr, false};

  // nbuckets is 64 bits wide. __builtin_popcount() takes an unsigned int
  // and would silently drop the upper half, so use the long long variant.
  ASSERT(__builtin_popcountll(nbuckets) == 1);
  i64 idx = hash & (nbuckets - 1);
  i64 nretry = 0;

  while (nretry < MIN_NBUCKETS) {
    const char *ptr = keys[idx];

    // Another thread is filling in this bucket. Spin until it publishes
    // the real key pointer, then look at the same bucket again.
    if (ptr == locked) {
#ifdef __x86_64__
      asm volatile("pause" ::: "memory");
#endif
      continue;
    }

    if (ptr == nullptr) {
      // Try to claim the empty bucket. If another thread got there first,
      // re-examine the same bucket, since it may now hold our key.
      if (!keys[idx].compare_exchange_strong(ptr, locked))
        continue;
      new (values + idx) T(val);
      sizes[idx] = key.size();
      // Publishing the key pointer last releases the bucket: the value
      // and size are written before any thread can see a non-sentinel key.
      keys[idx] = key.data();
      return {values + idx, true};
    }

    if (key.size() == sizes[idx] && memcmp(ptr, key.data(), sizes[idx]) == 0)
      return {values + idx, false};

    // Occupied by a different key; move on to the next bucket.
    idx = (idx + 1) & (nbuckets - 1);
    nretry++;
  }

  ASSERT(false && "ConcurrentMap is full");
  return {nullptr, false};
}

// The ConcurrentMap member functions are defined in this file, so
// explicitly instantiate the class for SectionFragment<E> of each target
// listed below.
#define INSTANTIATE(E) \
template class ConcurrentMap<SectionFragment<E>>;

INSTANTIATE(X86_64);
INSTANTIATE(I386);
INSTANTIATE(AARCH64);
21 changes: 21 additions & 0 deletions hyperloglog.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
// This file implements the HyperLogLog algorithm, which estimates
// the number of unique items in a given multiset.
//
// For more info, read
// https://engineering.fb.com/2018/12/13/data-infrastructure/hyperloglog

#include "mold.h"

#include <cmath>

// Returns the estimated number of distinct hashes inserted so far,
// computed as ALPHA * NBUCKETS^2 divided by the sum of 2^-v over every
// bucket value v.
i64 HyperLogLog::get_cardinality() const {
  double sum = 0;
  for (i64 i = 0; i < NBUCKETS; i++) {
    i64 rank = buckets[i];
    sum += pow(2, -rank);
  }
  return ALPHA * NBUCKETS * NBUCKETS / sum;
}

void HyperLogLog::merge(const HyperLogLog &other) {
for (i64 i = 0; i < NBUCKETS; i++)
merge_one(i, other.buckets[i]);
}
7 changes: 7 additions & 0 deletions main.cc
Original file line number Diff line number Diff line change
Expand Up @@ -415,6 +415,13 @@ int do_main(int argc, char **argv) {
if (ctx.objs.empty())
Fatal(ctx) << "no input files";

{
Timer t(ctx, "register_section_pieces");
tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
file->register_section_pieces(ctx);
});
}

// Uniquify shared object files by soname
{
std::unordered_set<std::string_view> seen;
Expand Down
86 changes: 74 additions & 12 deletions mold.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ template <typename E> class ROutputShdr;
template <typename E> class RStrtabSection;
template <typename E> class RSymtabSection;

template <typename T> class ConcurrentMap;

class ZlibCompressor;
class GzipCompressor;
class TarFile;
Expand Down Expand Up @@ -302,6 +304,59 @@ class InputSection {
void report_undef(Context<E> &ctx, Symbol<E> &sym);
};

//
// hyperloglog.cc
//

class HyperLogLog {
public:
HyperLogLog() : buckets(NBUCKETS) {}

void insert(u32 hash) {
merge_one(hash & (NBUCKETS - 1), __builtin_clz(hash) + 1);
}

void merge_one(i64 idx, u8 newval) {
u8 cur = buckets[idx];
while (cur < newval)
if (buckets[idx].compare_exchange_strong(cur, newval))
break;
}

i64 get_cardinality() const;
void merge(const HyperLogLog &other);

private:
static constexpr i64 NBUCKETS = 2048;
static constexpr double ALPHA = 0.79402;

std::vector<std::atomic_uint8_t> buckets;
};

//
// concurrent_map.cc
//

// A minimal insert-only hash table keyed by byte strings.
//
// The table is three parallel arrays indexed by bucket number: the key
// pointer, the key length and the value. It has to be sized with
// resize() (or the sizing constructor) before insert() can store
// anything.
template <typename T>
class ConcurrentMap {
public:
  ConcurrentMap();
  ConcurrentMap(i64 nbuckets);
  ~ConcurrentMap();

  // The map owns raw heap arrays that the destructor frees, so a
  // member-wise copy would free them twice. Copying is disabled.
  ConcurrentMap(const ConcurrentMap &) = delete;
  ConcurrentMap &operator=(const ConcurrentMap &) = delete;

  void resize(i64 nbuckets);
  std::pair<T *, bool> insert(std::string_view key, u64 hash, const T &val);

  // Returns true if bucket `idx` holds a non-null key pointer. `idx` is
  // not range-checked.
  bool has_key(i64 idx) const {
    return keys[idx];
  }

  i64 nbuckets = 0;
  std::atomic<const char *> *keys = nullptr;
  u32 *sizes = nullptr;
  T *values = nullptr;
};

//
// output_chunks.cc
//
Expand Down Expand Up @@ -645,27 +700,22 @@ class MergedSection : public OutputChunk<E> {
static MergedSection<E> *
get_instance(Context<E> &ctx, std::string_view name, u64 type, u64 flags);

SectionFragment<E> *insert(std::string_view data, i64 alignment);
void assign_offsets();
SectionFragment<E> *insert(std::string_view data, u64 hash, i64 alignment);
void assign_offsets(Context<E> &ctx);
void copy_buf(Context<E> &ctx) override;
void write_to(Context<E> &ctx, u8 *buf) override;

private:
using MapTy =
tbb::concurrent_unordered_map<std::string_view, SectionFragment<E>>;
HyperLogLog estimator;

private:
static constexpr i64 NUM_SHARDS = 64;

MergedSection(std::string_view name, u64 flags, u32 type)
: OutputChunk<E>(this->SYNTHETIC) {
this->name = name;
this->shdr.sh_flags = flags;
this->shdr.sh_type = type;
}
MergedSection(std::string_view name, u64 flags, u32 type);

MapTy maps[NUM_SHARDS];
ConcurrentMap<SectionFragment<E>> map;
i64 shard_offsets[NUM_SHARDS + 1] = {};
tbb::enumerable_thread_specific<i64> max_alignments;
std::once_flag once_flag;
};

template <typename E>
Expand Down Expand Up @@ -869,6 +919,16 @@ struct ComdatGroup {
std::atomic_uint32_t owner = -1;
};

// Per-input-section bookkeeping for a section whose contents are split
// into pieces. `strings`, `hashes`, `frag_offsets` and `fragments` are
// presumably parallel arrays with one entry per piece -- TODO confirm
// against the code that fills them in.
template <typename E>
struct MergeableSection {
  // Initialized to null so that a default-constructed object never
  // carries an indeterminate pointer.
  MergedSection<E> *parent = nullptr;
  ElfShdr<E> shdr;
  std::vector<std::string_view> strings;
  std::vector<u64> hashes;
  std::vector<u32> frag_offsets;
  std::vector<SectionFragment<E> *> fragments;
};

// InputFile is the base class of ObjectFile and SharedFile.
template <typename E>
class InputFile {
Expand Down Expand Up @@ -911,6 +971,7 @@ class ObjectFile : public InputFile<E> {
static ObjectFile<E> *create_internal_file(Context<E> &ctx);

void parse(Context<E> &ctx);
void register_section_pieces(Context<E> &ctx);
void resolve_lazy_symbols(Context<E> &ctx);
void resolve_regular_symbols(Context<E> &ctx);
void mark_live_objects(Context<E> &ctx,
Expand Down Expand Up @@ -981,6 +1042,7 @@ class ObjectFile : public InputFile<E> {
std::string_view symbol_strtab;
const ElfShdr<E> *symtab_sec;
std::span<u32> symtab_shndx_sec;
std::vector<std::unique_ptr<MergeableSection<E>>> mergeable_sections;
};

// SharedFile represents an input .so file.
Expand Down
Loading

0 comments on commit 41b2fa7

Please sign in to comment.