Skip to content

Commit

Permalink
[x86-64] Implement TLSGD to TLSIE relaxation
Browse files Browse the repository at this point in the history
If we know that the .so file we are creating will not be dlopen'ed,
we can relax __tls_get_addr function calls to GOT loads.
  • Loading branch information
rui314 committed Jan 14, 2023
1 parent 983fe0f commit 25d02bb
Show file tree
Hide file tree
Showing 8 changed files with 128 additions and 24 deletions.
2 changes: 2 additions & 0 deletions docs/mold.1
Original file line number Diff line number Diff line change
Expand Up @@ -1198,6 +1198,8 @@ Mark DSO non-deletable at runtime.
.It Fl z Cm nodlopen
Mark DSO not available to
.Xr dlopen 3 .
This option makes it possible for the linker to optimize thread-local \
variable accesses by rewriting instructions for some targets.
.Pp
.It Fl z Cm nodump
Mark DSO not available to
Expand Down
8 changes: 4 additions & 4 deletions elf/arch-i386.cc
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,7 @@ void write_plt_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
0xcc, // (padding)
};
memcpy(buf, insn, sizeof(insn));
*(ul32 *)(buf + 5) = sym.get_plt_idx(ctx) * sizeof(ElfRel<E>);
*(ul32 *)(buf + 11) = sym.get_gotplt_addr(ctx) - ctx.got->shdr.sh_addr;
} else {
static const u8 insn[] = {
Expand All @@ -149,10 +150,9 @@ void write_plt_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
0xcc, // (padding)
};
memcpy(buf, insn, sizeof(insn));
*(ul32 *)(buf + 5) = sym.get_plt_idx(ctx) * sizeof(ElfRel<E>);
*(ul32 *)(buf + 11) = sym.get_gotplt_addr(ctx);
}

*(ul32 *)(buf + 5) = sym.get_plt_idx(ctx) * sizeof(ElfRel<E>);
}

template <>
Expand Down Expand Up @@ -528,7 +528,7 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
ty != R_386_GOT32 && ty != R_386_GOT32X)
Fatal(ctx) << *this << ": TLS_GD reloc must be followed by PLT or GOT32";

if (relax_tlsgd(ctx, sym))
if (ctx.arg.relax && !ctx.arg.shared && !sym.is_imported)
i++;
else
sym.flags.fetch_or(NEEDS_TLSGD, std::memory_order_relaxed);
Expand All @@ -542,7 +542,7 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
ty != R_386_GOT32 && ty != R_386_GOT32X)
Fatal(ctx) << *this << ": TLS_LDM reloc must be followed by PLT or GOT32";

if (relax_tlsld(ctx))
if (ctx.arg.relax && !ctx.arg.shared)
i++;
else
ctx.needs_tlsld.store(true, std::memory_order_relaxed);
Expand Down
5 changes: 3 additions & 2 deletions elf/arch-s390x.cc
Original file line number Diff line number Diff line change
Expand Up @@ -456,12 +456,13 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
break;
case R_390_TLS_GD32:
case R_390_TLS_GD64:
if (!relax_tlsgd(ctx, sym))
if (bool do_relax = ctx.arg.relax && !ctx.arg.shared && !sym.is_imported;
!do_relax)
sym.flags.fetch_or(NEEDS_TLSGD, std::memory_order_relaxed);
break;
case R_390_TLS_LDM32:
case R_390_TLS_LDM64:
if (!relax_tlsld(ctx))
if (bool do_relax = ctx.arg.relax && !ctx.arg.shared; !do_relax)
ctx.needs_tlsld.store(true, std::memory_order_relaxed);
break;
case R_390_TLS_LE32:
Expand Down
51 changes: 43 additions & 8 deletions elf/arch-x86-64.cc
Original file line number Diff line number Diff line change
Expand Up @@ -234,8 +234,37 @@ static void relax_gd_to_le(u8 *loc, ElfRel<E> rel, u64 val) {
}
}

// Overwrite a TLS General-Dynamic code sequence (a call to
// __tls_get_addr) in place with a sequence that loads the thread
// pointer from %fs:0 and adds an offset read from a GOT slot.
//
// loc: pointer into the output buffer at the field the TLSGD
//      relocation applies to.
// rel: the relocation that immediately follows the TLSGD one; its type
//      selects which replacement sequence is written.
// val: distance to the symbol's GOTTP slot. The caller passes
//      `sym.get_gottp_addr(ctx) - P`, P presumably being the address
//      that corresponds to `loc` -- confirm against the caller.
static void relax_gd_to_ie(u8 *loc, ElfRel<E> rel, u64 val) {
switch (rel.r_type) {
case R_X86_64_PLT32:
case R_X86_64_PC32:
case R_X86_64_GOTPCREL:
case R_X86_64_GOTPCRELX: {
// 16-byte replacement (9-byte mov followed by 7-byte add).
static const u8 insn[] = {
0x64, 0x48, 0x8b, 0x04, 0x25, 0, 0, 0, 0, // mov %fs:0, %rax
0x48, 0x03, 0x05, 0, 0, 0, 0, // add foo@gottpoff(%rip), %rax
};
// The sequence is written starting 4 bytes before loc, so the 32-bit
// displacement of the add (bytes 12..15 of insn) lands at loc + 8.
memcpy(loc - 4, insn, sizeof(insn));
// The add ends at loc + 12. Subtracting 12 presumably rebases val from
// loc to the end of the instruction, which is what RIP-relative
// addressing is measured from.
*(ul32 *)(loc + 8) = val - 12;
break;
}
case R_X86_64_PLTOFF64: {
// 22-byte replacement: the same mov/add pair followed by a 6-byte nop.
// NOTE(review): the longer original sequence is presumably the
// large-code-model call form -- confirm.
static const u8 insn[] = {
0x64, 0x48, 0x8b, 0x04, 0x25, 0, 0, 0, 0, // mov %fs:0, %rax
0x48, 0x03, 0x05, 0, 0, 0, 0, // add foo@gottpoff(%rip), %rax
0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00, // nop
};
// Here the sequence starts 3 bytes before loc, so the displacement of
// the add lands at loc + 9 and the add ends at loc + 13.
memcpy(loc - 3, insn, sizeof(insn));
*(ul32 *)(loc + 9) = val - 13;
break;
}
default:
// No other following relocation type is handled here; presumably the
// relocation scanning pass has already rejected anything else.
unreachable();
}
}

// Rewrite a function call to __tls_get_addr to a cheaper instruction
// sequence. The difference from relax_ld_to_le is that we are
// sequence. The difference from relax_gd_to_le is that we are
// materializing a Dynamic Thread Pointer for the current ELF module
// instead of an address for a particular thread-local variable.
static void relax_ld_to_le(u8 *loc, ElfRel<E> rel, u64 val) {
Expand Down Expand Up @@ -416,6 +445,9 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
case R_X86_64_TLSGD:
if (sym.has_tlsgd(ctx)) {
write32s(sym.get_tlsgd_addr(ctx) + A - P);
} else if (sym.has_gottp(ctx)) {
relax_gd_to_ie(loc, rels[i + 1], sym.get_gottp_addr(ctx) - P);
i++;
} else {
relax_gd_to_le(loc, rels[i + 1], S - ctx.tp_addr);
i++;
Expand Down Expand Up @@ -662,7 +694,7 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
if (sym.is_imported)
sym.flags.fetch_or(NEEDS_PLT, std::memory_order_relaxed);
break;
case R_X86_64_TLSGD: {
case R_X86_64_TLSGD:
if (rel.r_addend != -4)
Fatal(ctx) << *this << ": bad r_addend for R_X86_64_TLSGD";

Expand All @@ -675,13 +707,17 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
ty != R_X86_64_GOTPCRELX)
Fatal(ctx) << *this << ": TLSGD reloc must be followed by PLT or GOTPCREL";

if (relax_tlsgd(ctx, sym))
if (ctx.arg.relax && !sym.is_imported && !ctx.arg.shared) {
i++;
else
} else if (ctx.arg.relax && !sym.is_imported && ctx.arg.shared &&
!ctx.arg.z_dlopen) {
sym.flags.fetch_or(NEEDS_GOTTP, std::memory_order_relaxed);
i++;
} else {
sym.flags.fetch_or(NEEDS_TLSGD, std::memory_order_relaxed);
}
break;
}
case R_X86_64_TLSLD: {
case R_X86_64_TLSLD:
if (rel.r_addend != -4)
Fatal(ctx) << *this << ": bad r_addend for R_X86_64_TLSLD";

Expand All @@ -694,12 +730,11 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
ty != R_X86_64_GOTPCRELX)
Fatal(ctx) << *this << ": TLSLD reloc must be followed by PLT or GOTPCREL";

if (relax_tlsld(ctx))
if (ctx.arg.relax && !ctx.arg.shared)
i++;
else
ctx.needs_tlsld.store(true, std::memory_order_relaxed);
break;
}
case R_X86_64_GOTTPOFF: {
if (rel.r_addend != -4)
Fatal(ctx) << *this << ": bad r_addend for R_X86_64_GOTTPOFF";
Expand Down
10 changes: 0 additions & 10 deletions elf/mold.h
Original file line number Diff line number Diff line change
Expand Up @@ -2826,16 +2826,6 @@ inline bool is_c_identifier(std::string_view s) {
return true;
}

// Decide whether a TLSGD access to `sym` is to be relaxed. The answer is
// yes only when ctx.arg.relax is set, ctx.arg.shared is not set, and the
// symbol is not marked as imported.
template <typename E>
inline bool relax_tlsgd(Context<E> &ctx, Symbol<E> &sym) {
  if (!ctx.arg.relax)
    return false;
  if (ctx.arg.shared)
    return false;
  return !sym.is_imported;
}

// Decide whether a TLSLD access is to be relaxed. The answer is yes only
// when ctx.arg.relax is set and ctx.arg.shared is not set.
template <typename E>
inline bool relax_tlsld(Context<E> &ctx) {
  if (!ctx.arg.relax)
    return false;
  return !ctx.arg.shared;
}

template <typename E>
inline bool relax_tlsdesc(Context<E> &ctx, Symbol<E> &sym) {
// TLSDESC relocs must be always relaxed for statically-linked
Expand Down
38 changes: 38 additions & 0 deletions test/elf/tls-gd-to-ie.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#!/bin/bash
# Runtime test for global-dynamic TLS accesses living in a shared object.
# The same object file is linked into a .so with four different option
# combinations; each resulting program must print "1 2 3".
#
# common.inc is sourced from this script's directory; it presumably
# provides $CC, $GCC, $QEMU and the scratch directory $t -- confirm.
. $(dirname $0)/common.inc

# a.o: three thread-local ints forced to the global-dynamic TLS model
# (a static initialized one, an exported initialized one and an exported
# uninitialized one). foo() stores 3 into x3 and prints all three values.
cat <<EOF | $GCC -fPIC -c -o $t/a.o -xc -
#include <stdio.h>
__attribute__((tls_model("global-dynamic"))) static _Thread_local int x1 = 1;
__attribute__((tls_model("global-dynamic"))) _Thread_local int x2 = 2;
__attribute__((tls_model("global-dynamic"))) _Thread_local int x3;
int foo() {
x3 = 3;
printf("%d %d %d\n", x1, x2, x3);
return 0;
}
EOF

# b.o: a main() that just calls foo() from the shared object.
cat <<EOF | $CC -fPIC -c -o $t/b.o -xc -
int foo();
int main() { foo(); }
EOF

# Case 1: shared object linked with default options.
# (-B. presumably makes the compiler driver pick up the linker under test
# from the current directory -- confirm.)
$CC -B. -shared -o $t/c.so $t/a.o
$CC -B. -o $t/exe1 $t/b.o $t/c.so
$QEMU $t/exe1 | grep -q '1 2 3'

# Case 2: shared object linked with -no-relax.
$CC -B. -shared -o $t/d.so $t/a.o -Wl,-no-relax
$CC -B. -o $t/exe2 $t/b.o $t/d.so
$QEMU $t/exe2 | grep -q '1 2 3'

# Case 3: shared object linked with -z nodlopen. This is presumably the
# configuration in which the GD-to-IE rewrite can actually take place.
$CC -B. -shared -o $t/e.so $t/a.o -Wl,-z,nodlopen
$CC -B. -o $t/exe3 $t/b.o $t/e.so
$QEMU $t/exe3 | grep -q '1 2 3'

# Case 4: -z nodlopen combined with -no-relax.
$CC -B. -shared -o $t/f.so $t/a.o -Wl,-z,nodlopen -Wl,-no-relax
$CC -B. -o $t/exe4 $t/b.o $t/f.so
$QEMU $t/exe4 | grep -q '1 2 3'
Empty file modified test/elf/x86_64_ifunc-alias.sh
100644 → 100755
Empty file.
38 changes: 38 additions & 0 deletions test/elf/x86_64_tls-gd-to-ie.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#!/bin/bash
# Runtime test for global-dynamic TLS accesses in a shared object whose
# object file is compiled with -mcmodel=large. The object is linked into
# a .so with four different option combinations; each resulting program
# must print "1 2 3".
#
# common.inc is sourced from this script's directory; it presumably
# provides $CC, $GCC, $QEMU and the scratch directory $t -- confirm.
. $(dirname $0)/common.inc

# a.o: three thread-local ints forced to the global-dynamic TLS model,
# built with the large code model. NOTE(review): this presumably makes the
# compiler emit the longer __tls_get_addr call sequence -- confirm.
cat <<EOF | $GCC -fPIC -c -o $t/a.o -xc - -mcmodel=large
#include <stdio.h>
__attribute__((tls_model("global-dynamic"))) static _Thread_local int x1 = 1;
__attribute__((tls_model("global-dynamic"))) _Thread_local int x2 = 2;
__attribute__((tls_model("global-dynamic"))) _Thread_local int x3;
int foo() {
x3 = 3;
printf("%d %d %d\n", x1, x2, x3);
return 0;
}
EOF

# b.o: a main() that just calls foo() from the shared object.
cat <<EOF | $CC -fPIC -c -o $t/b.o -xc -
int foo();
int main() { foo(); }
EOF

# Case 1: shared object linked with default options.
$CC -B. -shared -o $t/c.so $t/a.o
$CC -B. -o $t/exe1 $t/b.o $t/c.so
$QEMU $t/exe1 | grep -q '1 2 3'

# Case 2: shared object linked with -no-relax.
$CC -B. -shared -o $t/d.so $t/a.o -Wl,-no-relax
$CC -B. -o $t/exe2 $t/b.o $t/d.so
$QEMU $t/exe2 | grep -q '1 2 3'

# Case 3: shared object linked with -z nodlopen.
$CC -B. -shared -o $t/e.so $t/a.o -Wl,-z,nodlopen
$CC -B. -o $t/exe3 $t/b.o $t/e.so
$QEMU $t/exe3 | grep -q '1 2 3'

# Case 4: -z nodlopen combined with -no-relax.
$CC -B. -shared -o $t/f.so $t/a.o -Wl,-z,nodlopen -Wl,-no-relax
$CC -B. -o $t/exe4 $t/b.o $t/f.so
$QEMU $t/exe4 | grep -q '1 2 3'

0 comments on commit 25d02bb

Please sign in to comment.