diff --git a/Makefile b/Makefile
index c219ca3f05..1b369438c0 100644
--- a/Makefile
+++ b/Makefile
@@ -8,11 +8,12 @@ CPPFLAGS = -g -Imimalloc/include -pthread -std=c++20 \
   -DGIT_HASH=\"$(GIT_HASH)\" \
   $(EXTRA_CPPFLAGS)
 LDFLAGS += $(EXTRA_LDFLAGS) -rdynamic
-LIBS = -Wl,-as-needed -lcrypto -pthread -lz -lxxhash -ldl
+LIBS = -Wl,-as-needed -lcrypto -pthread -lz -lxxhash -ldl -lm
 OBJS = main.o object_file.o input_sections.o output_chunks.o \
   mapfile.o perf.o linker_script.o archive_file.o output_file.o \
   subprocess.o gc_sections.o icf.o symbols.o cmdline.o filepath.o \
   passes.o tar.o compress.o memory_mapped_file.o relocatable.o \
+  concurrent_map.o hyperloglog.o \
   arch_x86_64.o arch_i386.o arch_aarch64.o
 
 PREFIX ?= /usr
diff --git a/concurrent_map.cc b/concurrent_map.cc
new file mode 100644
index 0000000000..ee59270026
--- /dev/null
+++ b/concurrent_map.cc
@@ -0,0 +1,80 @@
+#include "mold.h"
+
+static const char *locked = (char *)-1;
+
+static constexpr i64 MIN_NBUCKETS = 256;
+
+template <typename T>
+ConcurrentMap<T>::ConcurrentMap() {}
+
+template <typename T>
+ConcurrentMap<T>::ConcurrentMap(i64 nbuckets) {
+  resize(nbuckets);
+}
+
+template <typename T>
+void ConcurrentMap<T>::resize(i64 nbuckets) {
+  this->~ConcurrentMap();
+
+  nbuckets = std::max(MIN_NBUCKETS, next_power_of_two(nbuckets));
+
+  this->nbuckets = nbuckets;
+  keys = (std::atomic<const char *> *)calloc(nbuckets, sizeof(keys[0]));
+  sizes = (u32 *)calloc(nbuckets, sizeof(sizes[0]));
+  values = (T *)calloc(nbuckets, sizeof(values[0]));
+}
+
+template <typename T>
+ConcurrentMap<T>::~ConcurrentMap() {
+  if (keys) {
+    free((void *)keys);
+    free((void *)sizes);
+    free((void *)values);
+  }
+}
+
+template <typename T>
+std::pair<T *, bool>
+ConcurrentMap<T>::insert(std::string_view key, u64 hash, const T &val) {
+  if (!keys)
+    return {nullptr, false};
+
+  ASSERT(__builtin_popcount(nbuckets) == 1);
+  i64 idx = hash & (nbuckets - 1);
+  i64 nretry = 0;
+
+  while (nretry < MIN_NBUCKETS) {
+    const char *ptr = keys[idx];
+    if (ptr == locked) {
+#ifdef __x86_64__
+      asm volatile("pause" ::: "memory");
+#endif
+      continue;
+    }
+
+    if (ptr == nullptr) {
+      if (!keys[idx].compare_exchange_strong(ptr, locked))
+        continue;
+      new (values + idx) T(val);
+      sizes[idx] = key.size();
+      keys[idx] = key.data();
+      return {values + idx, true};
+    }
+
+    if (key.size() == sizes[idx] && memcmp(ptr, key.data(), sizes[idx]) == 0)
+      return {values + idx, false};
+
+    idx = (idx + 1) & (nbuckets - 1);
+    nretry++;
+  }
+
+  ASSERT(false && "ConcurrentMap is full");
+  return {nullptr, false};
+}
+
+#define INSTANTIATE(E)                                          \
+  template class ConcurrentMap<SectionFragment<E>>;
+
+INSTANTIATE(X86_64);
+INSTANTIATE(I386);
+INSTANTIATE(AARCH64);
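A note on the insertion protocol above, with a minimal standalone sketch (illustrative names, not mold's; it assumes a power-of-two bucket count and key bytes that outlive the map, which holds in mold because keys point into mmap'ed input files): a bucket's key pointer doubles as its lock. `nullptr` means empty, the sentinel `(char *)-1` means another thread is mid-initialization, and storing the real key pointer publishes the slot.

```cpp
#include <atomic>
#include <cstdint>
#include <cstring>
#include <string_view>
#include <utility>

struct StrIntMap {
  static constexpr int64_t NBUCKETS = 256; // must be a power of two
  std::atomic<const char *> keys[NBUCKETS] = {};
  uint32_t sizes[NBUCKETS] = {};
  int values[NBUCKETS] = {};

  // Returns {pointer to the value slot, whether this call inserted it}.
  std::pair<int *, bool> insert(std::string_view key, uint64_t hash, int val) {
    const char *locked = (const char *)-1;
    int64_t idx = hash & (NBUCKETS - 1);
    for (;;) {
      const char *ptr = keys[idx].load();
      if (ptr == locked)
        continue; // another thread is filling this slot; spin until published

      if (ptr == nullptr) {
        // Try to take the slot. On CAS failure, ptr is reloaded and we
        // fall through to re-examine the bucket.
        if (!keys[idx].compare_exchange_strong(ptr, locked))
          continue;
        values[idx] = val;
        sizes[idx] = key.size();
        keys[idx] = key.data(); // publish; readers may now match the key
        return {values + idx, true};
      }

      if (key.size() == sizes[idx] && memcmp(ptr, key.data(), key.size()) == 0)
        return {values + idx, false}; // someone else inserted the same key

      idx = (idx + 1) & (NBUCKETS - 1); // linear probing
    }
  }
};
```

Unlike the patch, this sketch probes forever instead of giving up after MIN_NBUCKETS retries; the real map asserts the table never fills, which the HyperLogLog-based presizing introduced below is responsible for ensuring.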
diff --git a/hyperloglog.cc b/hyperloglog.cc
new file mode 100644
index 0000000000..6cff1d7074
--- /dev/null
+++ b/hyperloglog.cc
@@ -0,0 +1,21 @@
+// This file implements the HyperLogLog algorithm, which estimates
+// the number of unique items in a given multiset.
+//
+// For more info, read
+// https://engineering.fb.com/2018/12/13/data-infrastructure/hyperloglog
+
+#include "mold.h"
+
+#include <cmath>
+
+i64 HyperLogLog::get_cardinality() const {
+  double z = 0;
+  for (i64 val : buckets)
+    z += pow(2, -val);
+  return ALPHA * NBUCKETS * NBUCKETS / z;
+}
+
+void HyperLogLog::merge(const HyperLogLog &other) {
+  for (i64 i = 0; i < NBUCKETS; i++)
+    merge_one(i, other.buckets[i]);
+}
diff --git a/main.cc b/main.cc
index 1ffa10be8b..60e54809a5 100644
--- a/main.cc
+++ b/main.cc
@@ -415,6 +415,13 @@ int do_main(int argc, char **argv) {
   if (ctx.objs.empty())
     Fatal(ctx) << "no input files";
 
+  {
+    Timer t(ctx, "register_section_pieces");
+    tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
+      file->register_section_pieces(ctx);
+    });
+  }
+
   // Uniquify shared object files by soname
   {
     std::unordered_set<std::string_view> seen;
diff --git a/mold.h b/mold.h
index b577e78c8a..550545cadf 100644
--- a/mold.h
+++ b/mold.h
@@ -63,6 +63,8 @@ template <typename E> class ROutputShdr;
 template <typename E> class RStrtabSection;
 template <typename E> class RSymtabSection;
 
+template <typename T> class ConcurrentMap;
+
 class ZlibCompressor;
 class GzipCompressor;
 class TarFile;
@@ -302,6 +304,59 @@ class InputSection {
   void report_undef(Context<E> &ctx, Symbol<E> &sym);
 };
 
+//
+// hyperloglog.cc
+//
+
+class HyperLogLog {
+public:
+  HyperLogLog() : buckets(NBUCKETS) {}
+
+  void insert(u32 hash) {
+    merge_one(hash & (NBUCKETS - 1), __builtin_clz(hash) + 1);
+  }
+
+  void merge_one(i64 idx, u8 newval) {
+    u8 cur = buckets[idx];
+    while (cur < newval)
+      if (buckets[idx].compare_exchange_strong(cur, newval))
+        break;
+  }
+
+  i64 get_cardinality() const;
+  void merge(const HyperLogLog &other);
+
+private:
+  static constexpr i64 NBUCKETS = 2048;
+  static constexpr double ALPHA = 0.79402;
+
+  std::vector<std::atomic<u8>> buckets;
+};
+
+//
+// concurrent_map.cc
+//
+
+template <typename T>
+class ConcurrentMap {
+public:
+  ConcurrentMap();
+  ConcurrentMap(i64 nbuckets);
+  ~ConcurrentMap();
+
+  void resize(i64 nbuckets);
+  std::pair<T *, bool> insert(std::string_view key, u64 hash, const T &val);
+
+  bool has_key(i64 idx) {
+    return keys[idx];
+  }
+
+  i64 nbuckets = 0;
+  std::atomic<const char *> *keys = nullptr;
+  u32 *sizes = nullptr;
+  T *values = nullptr;
+};
+
 //
 // output_chunks.cc
 //
@@ -645,27 +700,22 @@ class MergedSection : public OutputChunk<E> {
   static MergedSection<E> *
   get_instance(Context<E> &ctx, std::string_view name, u64 type, u64 flags);
 
-  SectionFragment<E> *insert(std::string_view data, i64 alignment);
-  void assign_offsets();
+  SectionFragment<E> *insert(std::string_view data, u64 hash, i64 alignment);
+  void assign_offsets(Context<E> &ctx);
 
   void copy_buf(Context<E> &ctx) override;
   void write_to(Context<E> &ctx, u8 *buf) override;
 
-private:
-  using MapTy =
-    tbb::concurrent_unordered_map<std::string_view, SectionFragment<E>>;
+  HyperLogLog estimator;
 
+private:
   static constexpr i64 NUM_SHARDS = 64;
 
-  MergedSection(std::string_view name, u64 flags, u32 type)
-    : OutputChunk<E>(this->SYNTHETIC) {
-    this->name = name;
-    this->shdr.sh_flags = flags;
-    this->shdr.sh_type = type;
-  }
+  MergedSection(std::string_view name, u64 flags, u32 type);
 
-  MapTy maps[NUM_SHARDS];
+  ConcurrentMap<SectionFragment<E>> map;
   i64 shard_offsets[NUM_SHARDS + 1] = {};
   tbb::enumerable_thread_specific<i64> max_alignments;
+  std::once_flag once_flag;
 };
 
 template <typename E>
@@ -869,6 +919,16 @@ struct ComdatGroup {
   std::atomic_uint32_t owner = -1;
 };
 
+template <typename E>
+struct MergeableSection {
+  MergedSection<E> *parent;
+  ElfShdr<E> shdr;
+  std::vector<std::string_view> strings;
+  std::vector<u64> hashes;
+  std::vector<u32> frag_offsets;
+  std::vector<SectionFragment<E> *> fragments;
+};
+
 // InputFile is the base class of ObjectFile and SharedFile.
 template <typename E>
 class InputFile {
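To see why `get_cardinality` works: each distinct hash updates one of 2048 registers with the largest leading-zero count it has observed, plus one. A register holding rank r suggests roughly 2^r distinct values hashed into that bucket, and the harmonic-style sum `z` in hyperloglog.cc smooths outliers across buckets. A standalone sketch (illustrative names; the constants are copied from the patch, and the multiplicative hash is only for generating test input):

```cpp
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

struct Estimator {
  static constexpr int64_t NBUCKETS = 2048; // 2^11
  static constexpr double ALPHA = 0.79402;  // bias-correction constant from the patch
  std::vector<uint8_t> buckets = std::vector<uint8_t>(NBUCKETS);

  void insert(uint32_t hash) {
    if (!hash)
      return; // keep __builtin_clz well-defined (it is UB for 0)
    int64_t idx = hash & (NBUCKETS - 1);    // low 11 bits pick a bucket
    uint8_t rank = __builtin_clz(hash) + 1; // leading-zero count + 1
    if (buckets[idx] < rank)
      buckets[idx] = rank;                  // keep the per-bucket maximum
  }

  int64_t get_cardinality() const {
    double z = 0;
    for (uint8_t val : buckets)
      z += pow(2, -val);
    return ALPHA * NBUCKETS * NBUCKETS / z;
  }
};

int main() {
  Estimator e;
  // Feed in 1,000,000 distinct values, spread out by a 64-bit
  // multiplicative hash so the 32-bit inputs are well-distributed.
  for (uint64_t i = 1; i <= 1000000; i++)
    e.insert((uint32_t)((i * 0x9e3779b97f4a7c15ull) >> 32));
  printf("estimated cardinality: %lld\n", (long long)e.get_cardinality());
}
```

With 2048 buckets, HyperLogLog's theoretical standard error is about 1.04/√2048 ≈ 2.3%. The patch's ALPHA appears to sit a little above the textbook constant for this bucket count, which errs toward a slightly larger estimate; since the number only sizes a hash table, a bias in that direction is harmless.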
 
@@ -911,6 +971,7 @@ class ObjectFile : public InputFile<E> {
   static ObjectFile<E> *create_internal_file(Context<E> &ctx);
 
   void parse(Context<E> &ctx);
+  void register_section_pieces(Context<E> &ctx);
   void resolve_lazy_symbols(Context<E> &ctx);
   void resolve_regular_symbols(Context<E> &ctx);
   void mark_live_objects(Context<E> &ctx,
@@ -981,6 +1042,7 @@ class ObjectFile : public InputFile<E> {
   std::string_view symbol_strtab;
   const ElfShdr<E> *symtab_sec;
   std::span<u32> symtab_shndx_sec;
+  std::vector<std::unique_ptr<MergeableSection<E>>> mergeable_sections;
 };
 
 // SharedFile represents an input .so file.
diff --git a/object_file.cc b/object_file.cc
index 9fdca0cab2..b353a154c3 100644
--- a/object_file.cc
+++ b/object_file.cc
@@ -510,12 +510,6 @@ void ObjectFile<E>::initialize_symbols(Context<E> &ctx) {
   }
 }
 
-template <typename E>
-struct MergeableSection {
-  std::vector<SectionFragment<E> *> fragments;
-  std::vector<u32> frag_offsets;
-};
-
 static size_t find_null(std::string_view data, u64 entsize) {
   if (entsize == 1)
     return data.find('\0');
@@ -545,17 +539,17 @@
 //
 // We do not support mergeable sections that have relocations.
 template <typename E>
-static MergeableSection<E>
+static std::unique_ptr<MergeableSection<E>>
 split_section(Context<E> &ctx, InputSection<E> &sec) {
-  MergeableSection<E> rec;
-
-  MergedSection<E> *parent =
-    MergedSection<E>::get_instance(ctx, sec.name(), sec.shdr.sh_type,
-                                   sec.shdr.sh_flags);
+  std::unique_ptr<MergeableSection<E>> rec(new MergeableSection<E>);
+  rec->parent = MergedSection<E>::get_instance(ctx, sec.name(), sec.shdr.sh_type,
+                                               sec.shdr.sh_flags);
+  rec->shdr = sec.shdr;
 
   std::string_view data = sec.contents;
   const char *begin = data.data();
   u64 entsize = sec.shdr.sh_entsize;
+  HyperLogLog estimator;
 
   static_assert(sizeof(SectionFragment<E>::alignment) == 2);
   if (sec.shdr.sh_addralign >= UINT16_MAX)
@@ -570,9 +564,12 @@ split_section(Context<E> &ctx, InputSection<E> &sec) {
       std::string_view substr = data.substr(0, end + entsize);
       data = data.substr(end + entsize);
 
-      SectionFragment<E> *frag = parent->insert(substr, sec.shdr.sh_addralign);
-      rec.fragments.push_back(frag);
-      rec.frag_offsets.push_back(substr.data() - begin);
+      rec->strings.push_back(substr);
+      rec->frag_offsets.push_back(substr.data() - begin);
+
+      u64 hash = hash_string(substr);
+      rec->hashes.push_back(hash);
+      estimator.insert(hash);
     }
   } else {
     if (data.size() % entsize)
@@ -582,15 +579,19 @@ split_section(Context<E> &ctx, InputSection<E> &sec) {
       std::string_view substr = data.substr(0, entsize);
       data = data.substr(entsize);
 
-      SectionFragment<E> *frag = parent->insert(substr, sec.shdr.sh_addralign);
-      rec.fragments.push_back(frag);
-      rec.frag_offsets.push_back(substr.data() - begin);
+      rec->strings.push_back(substr);
+      rec->frag_offsets.push_back(substr.data() - begin);
+
+      u64 hash = hash_string(substr);
+      rec->hashes.push_back(hash);
+      estimator.insert(hash);
     }
   }
 
-  static Counter counter("string_fragments");
-  counter += rec.fragments.size();
+  rec->parent->estimator.merge(estimator);
 
+  static Counter counter("string_fragments");
+  counter += rec->strings.size();
   return rec;
 }
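For intuition on what split_section now records, here is a standalone sketch of the SHF_STRINGS path (illustrative code, not mold's: it hard-codes entsize = 1, skips the hashing and HyperLogLog steps, and assumes well-formed input, where the real code reports a Fatal error): each piece keeps its NUL terminator, and frag_offsets remembers where each piece started so relocations and symbol values can be translated later.

```cpp
#include <cstdint>
#include <cstdio>
#include <string_view>
#include <vector>

int main() {
  // A .rodata.str1.1-style section body: three NUL-terminated strings.
  std::string_view data("foo\0barbaz\0x\0", 13);
  const char *begin = data.data();

  std::vector<std::string_view> strings;
  std::vector<uint32_t> frag_offsets;

  while (!data.empty()) {
    size_t end = data.find('\0');                      // entsize == 1 case of find_null
    std::string_view substr = data.substr(0, end + 1); // include the NUL
    strings.push_back(substr);
    frag_offsets.push_back(substr.data() - begin);     // start offset of this piece
    data = data.substr(end + 1);
  }

  for (size_t i = 0; i < strings.size(); i++)
    printf("piece at offset %2u: \"%s\"\n", frag_offsets[i], strings[i].data());
  // piece at offset  0: "foo"
  // piece at offset  4: "barbaz"
  // piece at offset 11: "x"
}
```

The crucial change in the patch is that pieces are only recorded at this point; nothing touches the shared MergedSection map until every file has contributed its HyperLogLog counts and register_section_pieces runs as a separate pass.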
@@ -638,7 +639,7 @@ split_section(Context<E> &ctx, InputSection<E> &sec) {
 // is attached to the symbol.
 template <typename E>
 void ObjectFile<E>::initialize_mergeable_sections(Context<E> &ctx) {
-  std::vector<MergeableSection<E>> mergeable_sections(sections.size());
+  mergeable_sections.resize(sections.size());
 
   for (i64 i = 0; i < sections.size(); i++) {
     std::unique_ptr<InputSection<E>> &isec = sections[i];
@@ -648,6 +649,15 @@ void ObjectFile<E>::initialize_mergeable_sections(Context<E> &ctx) {
       isec->is_alive = false;
     }
   }
+}
+
+template <typename E>
+void ObjectFile<E>::register_section_pieces(Context<E> &ctx) {
+  for (std::unique_ptr<MergeableSection<E>> &m : mergeable_sections)
+    if (m)
+      for (i64 i = 0; i < m->strings.size(); i++)
+        m->fragments.push_back(m->parent->insert(m->strings[i], m->hashes[i],
+                                                 m->shdr.sh_addralign));
 
   // Initialize rel_fragments
   for (std::unique_ptr<InputSection<E>> &isec : sections) {
@@ -663,13 +673,10 @@ void ObjectFile<E>::initialize_mergeable_sections(Context<E> &ctx) {
     for (i64 i = 0; i < rels.size(); i++) {
       const ElfRel<E> &rel = rels[i];
       const ElfSym<E> &esym = elf_syms[rel.r_sym];
-
-      if (esym.st_type == STT_SECTION) {
-        MergeableSection<E> &m = mergeable_sections[get_shndx(esym)];
-        if (!m.fragments.empty())
-          len++;
-      }
+      if (esym.st_type == STT_SECTION && mergeable_sections[get_shndx(esym)])
+        len++;
     }
+
     if (len == 0)
       continue;
 
@@ -683,19 +690,20 @@ void ObjectFile<E>::initialize_mergeable_sections(Context<E> &ctx) {
       if (esym.st_type != STT_SECTION)
         continue;
 
-      MergeableSection<E> &m = mergeable_sections[get_shndx(esym)];
-      if (m.fragments.empty())
+      std::unique_ptr<MergeableSection<E>> &m =
+        mergeable_sections[get_shndx(esym)];
+      if (!m)
         continue;
 
       i64 offset = esym.st_value + isec->get_addend(rel);
-      std::span<u32> offsets = m.frag_offsets;
+      std::span<u32> offsets = m->frag_offsets;
 
       auto it = std::upper_bound(offsets.begin(), offsets.end(), offset);
       if (it == offsets.begin())
         Fatal(ctx) << *this << ": bad relocation at " << rel.r_sym;
       i64 idx = it - 1 - offsets.begin();
 
-      isec->rel_fragments[frag_idx++] = {m.fragments[idx], (i32)i,
+      isec->rel_fragments[frag_idx++] = {m->fragments[idx], (i32)i,
                                          (i32)(offset - offsets[idx])};
     }
 
@@ -708,11 +716,12 @@ void ObjectFile<E>::initialize_mergeable_sections(Context<E> &ctx) {
     if (esym.is_abs() || esym.is_common())
       continue;
 
-    MergeableSection<E> &m = mergeable_sections[get_shndx(esym)];
-    if (m.fragments.empty())
+    std::unique_ptr<MergeableSection<E>> &m =
+      mergeable_sections[get_shndx(esym)];
+    if (!m)
       continue;
 
-    std::span<u32> offsets = m.frag_offsets;
+    std::span<u32> offsets = m->frag_offsets;
 
     auto it = std::upper_bound(offsets.begin(), offsets.end(), esym.st_value);
     if (it == offsets.begin())
@@ -722,12 +731,13 @@ void ObjectFile<E>::initialize_mergeable_sections(Context<E> &ctx) {
     if (i < first_global)
       this->symbols[i]->value = esym.st_value - offsets[idx];
 
-    sym_fragments[i].frag = m.fragments[idx];
+    sym_fragments[i].frag = m->fragments[idx];
     sym_fragments[i].addend = esym.st_value - offsets[idx];
   }
 
-  for (MergeableSection<E> &m : mergeable_sections)
-    fragments.insert(fragments.end(), m.fragments.begin(), m.fragments.end());
+  for (std::unique_ptr<MergeableSection<E>> &m : mergeable_sections)
+    if (m)
+      fragments.insert(fragments.end(), m->fragments.begin(),
+                       m->fragments.end());
 }
 
 template <typename E>
diff --git a/output_chunks.cc b/output_chunks.cc
index b9825b99e5..11cc5f6607 100644
--- a/output_chunks.cc
+++ b/output_chunks.cc
@@ -1115,6 +1115,14 @@ void GnuHashSection<E>::copy_buf(Context<E> &ctx) {
   }
 }
 
+template <typename E>
+MergedSection<E>::MergedSection(std::string_view name, u64 flags, u32 type)
+  : OutputChunk<E>(this->SYNTHETIC) {
+  this->name = name;
+  this->shdr.sh_flags = flags;
+  this->shdr.sh_type = type;
+}
+
 template <typename E>
 MergedSection<E> *
 MergedSection<E>::get_instance(Context<E> &ctx, std::string_view name,
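Both the relocation and symbol paths above use the same lookup: frag_offsets is sorted by construction, so std::upper_bound finds the piece containing a given byte offset in O(log n), and the remainder becomes the addend. A small illustrative sketch (names are not mold's):

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <span>
#include <utility>
#include <vector>

// Returns {index of the piece containing `offset`, offset within that piece}.
static std::pair<int64_t, int64_t>
find_piece(std::span<const uint32_t> offsets, int64_t offset) {
  auto it = std::upper_bound(offsets.begin(), offsets.end(), offset);
  // it == offsets.begin() would mean offset < offsets[0], i.e. the
  // offset belongs to no piece; the real code reports it with Fatal().
  int64_t idx = it - 1 - offsets.begin();
  return {idx, offset - offsets[idx]};
}

int main() {
  std::vector<uint32_t> offsets = {0, 4, 11, 13}; // piece start offsets
  auto [idx, addend] = find_piece(offsets, 12);
  printf("offset 12 -> piece %ld + %ld\n", (long)idx, (long)addend); // piece 2 + 1
}
```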
@@ -1150,78 +1158,63 @@ MergedSection<E>::get_instance(Context<E> &ctx, std::string_view name,
 
 template <typename E>
 SectionFragment<E> *
-MergedSection<E>::insert(std::string_view data, i64 alignment) {
+MergedSection<E>::insert(std::string_view data, u64 hash, i64 alignment) {
   ASSERT(alignment < UINT16_MAX);
 
-  std::string_view suffix = data;
-  if (suffix.size() > 32)
-    suffix = suffix.substr(suffix.size() - 32);
-  i64 shard = hash_string(suffix) % NUM_SHARDS;
+  std::call_once(once_flag, [&]() {
+    // We aim at a 2/3 occupancy ratio
+    map.resize(estimator.get_cardinality() * 3 / 2);
+  });
 
   SectionFragment<E> *frag;
-  {
-    auto [it, inserted] =
-      maps[shard].insert(std::pair(data, SectionFragment(this, data)));
-    frag = &it->second;
-  }
+  bool inserted;
+  std::tie(frag, inserted) = map.insert(data, hash, SectionFragment(this, data));
+  ASSERT(frag);
 
   for (u16 cur = frag->alignment; cur < alignment;)
     if (frag->alignment.compare_exchange_strong(cur, alignment))
       break;
-
-  max_alignments.local() = std::max(max_alignments.local(), alignment);
   return frag;
 }
 
 template <typename E>
-void MergedSection<E>::assign_offsets() {
-  std::vector<SectionFragment<E> *> fragments[NUM_SHARDS];
-  i64 sizes[NUM_SHARDS] = {};
-
-  tbb::parallel_for((i64)0, NUM_SHARDS, [&](i64 i) {
-    for (auto it = maps[i].begin(); it != maps[i].end(); it++)
-      if (SectionFragment<E> &frag = it->second; frag.is_alive)
-        fragments[i].push_back(&frag);
+void MergedSection<E>::assign_offsets(Context<E> &ctx) {
+  std::vector<SectionFragment<E> *> fragments(map.nbuckets);
+  for (i64 i = 0; i < map.nbuckets; i++)
+    fragments[i] = map.values + i;
 
-    // Sort section fragments to make an output deterministic.
-    std::sort(fragments[i].begin(), fragments[i].end(),
-              [&](SectionFragment<E> *a, SectionFragment<E> *b) {
-                if (a->alignment != b->alignment)
-                  return a->alignment > b->alignment;
-                if (a->data.size() != b->data.size())
-                  return a->data.size() < b->data.size();
-                return a->data < b->data;
-              });
-
-    i64 offset = 0;
-    for (SectionFragment<E> *frag : fragments[i]) {
-      offset = align_to(offset, frag->alignment);
-      frag->offset = offset;
-      offset += frag->data.size();
-    }
-
-    sizes[i] = offset;
-  });
+  // Sort fragments to make the output deterministic.
+  tbb::parallel_sort(fragments.begin(), fragments.end(),
+                     [](SectionFragment<E> *a, SectionFragment<E> *b) {
+                       if (!a->is_alive || !b->is_alive)
+                         return a->is_alive && !b->is_alive;
+                       if (a->alignment != b->alignment)
+                         return a->alignment < b->alignment;
+                       if (a->data.size() != b->data.size())
+                         return a->data.size() < b->data.size();
+                       return a->data < b->data;
+                     });
 
-  i64 alignment = 1;
-  for (i64 x : max_alignments)
-    alignment = std::max(alignment, x);
-
-  for (i64 i = 1; i < NUM_SHARDS + 1; i++)
-    shard_offsets[i] =
-      align_to(shard_offsets[i - 1] + sizes[i - 1], alignment);
-
-  tbb::parallel_for((i64)1, NUM_SHARDS, [&](i64 i) {
-    for (SectionFragment<E> *frag : fragments[i])
-      frag->offset += shard_offsets[i];
-  });
+  // Remove dead fragments.
+  auto mid = std::partition_point(fragments.begin(), fragments.end(),
+                                  [](SectionFragment<E> *frag) -> bool {
+                                    return frag->is_alive;
+                                  });
+  fragments.resize(mid - fragments.begin());
 
+  // Assign offsets.
+  i64 offset = 0;
+  for (SectionFragment<E> *frag : fragments) {
+    offset = align_to(offset, frag->alignment);
+    frag->offset = offset;
+    offset += frag->data.size();
+    this->shdr.sh_addralign =
+      std::max<i64>(this->shdr.sh_addralign, frag->alignment);
+  }
+  this->shdr.sh_size = offset;
 
-  this->shdr.sh_size = shard_offsets[NUM_SHARDS];
-  this->shdr.sh_addralign = alignment;
 
   static Counter merged_strings("merged_strings");
-  for (std::span<SectionFragment<E> *> span : fragments)
-    merged_strings += span.size();
+  merged_strings += fragments.size();
 }
 
 template <typename E>
@@ -1231,11 +1224,12 @@ void MergedSection<E>::copy_buf(Context<E> &ctx) {
 
 template <typename E>
 void MergedSection<E>::write_to(Context<E> &ctx, u8 *buf) {
-  tbb::parallel_for((i64)0, NUM_SHARDS, [&](i64 i) {
-    memset(buf + shard_offsets[i], 0, shard_offsets[i + 1] - shard_offsets[i]);
-    for (auto it = maps[i].begin(); it != maps[i].end(); it++)
-      if (SectionFragment<E> &frag = it->second; frag.is_alive)
-        memcpy(buf + frag.offset, frag.data.data(), frag.data.size());
+  memset(buf, 0, this->shdr.sh_size);
+
+  tbb::parallel_for_each(map.values, map.values + map.nbuckets,
+                         [&](SectionFragment<E> &frag) {
+    if (frag.is_alive)
+      memcpy(buf + frag.offset, frag.data.data(), frag.data.size());
   });
 }
diff --git a/passes.cc b/passes.cc
index 168e1a3f29..287ebf854c 100644
--- a/passes.cc
+++ b/passes.cc
@@ -198,7 +198,8 @@ void add_comment_string(Context<E> &ctx, std::string str) {
   std::string_view buf = save_string(ctx, str);
   MergedSection<E> *sec =
     MergedSection<E>::get_instance(ctx, ".comment", SHT_PROGBITS, 0);
-  SectionFragment<E> *frag = sec->insert({buf.data(), buf.size() + 1}, 1);
+  std::string_view data(buf.data(), buf.size() + 1);
+  SectionFragment<E> *frag = sec->insert(data, hash_string(data), 1);
   frag->is_alive = true;
 }
 
@@ -221,9 +222,10 @@ void compute_merged_section_sizes(Context<E> &ctx) {
   if (char *env = getenv("MOLD_DEBUG"); env && env[0])
     add_comment_string(ctx, "mold command line: " + get_cmdline_args(ctx));
 
+  Timer t2(ctx, "MergedSection assign_offsets");
   tbb::parallel_for_each(ctx.merged_sections,
-                         [](std::unique_ptr<MergedSection<E>> &sec) {
-    sec->assign_offsets();
+                         [&](std::unique_ptr<MergedSection<E>> &sec) {
+    sec->assign_offsets(ctx);
   });
 }
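Finally, the arithmetic behind the std::call_once block in MergedSection::insert: sizing the table to 3/2 of the estimated cardinality targets at most 2/3 occupancy, and rounding up to a power of two (assuming next_power_of_two rounds up, as its use in resize suggests) only lowers the load factor further. A worked example:

```cpp
#include <bit>
#include <cstdint>
#include <cstdio>

int main() {
  int64_t cardinality = 1000000;      // HyperLogLog estimate of unique strings
  int64_t want = cardinality * 3 / 2; // aim for <= 2/3 occupancy
  // next_power_of_two equivalent: round up to a power of two.
  int64_t nbuckets = (int64_t)std::bit_ceil((uint64_t)want);
  printf("%lld buckets, load factor %.2f\n",
         (long long)nbuckets, (double)cardinality / nbuckets);
  // -> 2097152 buckets, load factor 0.48
}
```

Keeping the table at or below roughly 2/3 full is what makes the bounded linear probing in ConcurrentMap::insert safe in practice: probe sequences stay short, so the MIN_NBUCKETS retry limit is effectively never reached.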