Skip to content

Commit

Permalink
More changes to cope with 5.16 kernel change for coredumping signals …
Browse files Browse the repository at this point in the history
…with shared vms but different thread groups and both the old and new behaviors.

This also changes the cleartid behavior for coredumping signals. Code and comments in rr claim that the kernel does not clear the tid futex when a process takes a coredumping signal but experimentation shows that it does at least as early as 5.11. It's possible that this behavior changed in the kernel and we didn't notice. What rr does here probably is not that important as long as it matches in record and replay, but it's nice to have our test binaries run ok outside of rr, so I've updated rr to match the latest kernel here too.
  • Loading branch information
khuey committed Nov 20, 2021
1 parent 04bbacd commit 1a3b389
Show file tree
Hide file tree
Showing 6 changed files with 65 additions and 15 deletions.
10 changes: 6 additions & 4 deletions src/RecordSession.cc
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,7 @@ static bool handle_ptrace_exit_event(RecordTask* t) {
bool may_wait_exit = !is_coredumping_signal(exit_status.fatal_sig()) &&
!t->waiting_for_pid_namespace_tasks_to_exit();
record_exit_trace_event(t, exit_status);
t->record_exit_event(exit_status.fatal_sig(),
t->record_exit_event(
(!t->already_reaped() && !may_wait_exit) ? RecordTask::WRITE_CHILD_TID : RecordTask::KERNEL_WRITES_CHILD_TID);
if (!t->already_reaped()) {
t->proceed_to_exit(may_wait_exit);
Expand Down Expand Up @@ -1648,7 +1648,7 @@ bool RecordSession::signal_state_changed(RecordTask* t, StepState* step_state) {
WaitStatus exit_status = WaitStatus::for_fatal_sig(sig);
record_exit_trace_event(t, exit_status);
// Allow writing child_tid now because otherwise the write will race
t->record_exit_event(sig, RecordTask::WRITE_CHILD_TID);
t->record_exit_event(RecordTask::WRITE_CHILD_TID);
// On a real affected kernel, we probably would have never gotten here,
// since the signal we would be seeing was not deterministic, but let's
// be conservative and still try to emulate the ptrace stop.
Expand Down Expand Up @@ -1683,11 +1683,13 @@ bool RecordSession::signal_state_changed(RecordTask* t, StepState* step_state) {
}

// Mark each task in this address space as expecting a ptrace exit
// to avoid causing any ptrace_exit reaces.
// to avoid causing any ptrace_exit races.
if (is_fatal && is_coredumping_signal(sig)) {
for (Task *ot : t->vm()->task_set()) {
if (t != ot) {
((RecordTask *)ot)->waiting_for_ptrace_exit = true;
if (t->tgid() == ot->tgid() || coredumping_signal_takes_down_entire_vm()) {
((RecordTask *)ot)->waiting_for_ptrace_exit = true;
}
}
}
}
Expand Down
8 changes: 2 additions & 6 deletions src/RecordTask.cc
Original file line number Diff line number Diff line change
Expand Up @@ -253,15 +253,11 @@ RecordTask::~RecordTask() {
}
}

void RecordTask::record_exit_event(int fatal_signo, WriteChildTid write_child_tid) {
void RecordTask::record_exit_event(WriteChildTid write_child_tid) {
// The kernel explicitly only clears the futex if the address space is shared.
// If the address space has no other users then the futex will not be cleared
// even if it lives in shared memory which other tasks can read.
// If however, the exit was the result of a fatal, core-dump signal, the futex
// is not cleared (both to preserve the coredump and because any other users
// of the same address space were also shot down).
if (!is_coredumping_signal(fatal_signo) &&
!tid_futex.is_null() && as->task_set().size() > 1 &&
if (!tid_futex.is_null() && as->task_set().size() > 1 &&
as->has_mapping(tid_futex)) {
int val = 0;
record_local(tid_futex, &val);
Expand Down
2 changes: 1 addition & 1 deletion src/RecordTask.h
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ class RecordTask : public Task {
KERNEL_WRITES_CHILD_TID,
WRITE_CHILD_TID,
};
void record_exit_event(int exitsig = 0, WriteChildTid write_child_tid = KERNEL_WRITES_CHILD_TID);
void record_exit_event(WriteChildTid write_child_tid = KERNEL_WRITES_CHILD_TID);
/**
* Called when we're about to deliver a signal to this task. If it's a
* synthetic SIGCHLD and there's a ptraced task that needs to SIGCHLD,
Expand Down
14 changes: 10 additions & 4 deletions src/test/clone_cleartid_coredump.c
Original file line number Diff line number Diff line change
Expand Up @@ -51,16 +51,22 @@ int main(void) {
NULL, shared_page, NULL, shared_page);
test_assert(tid > 0);

futex(&child_tid, FUTEX_WAIT, tid, NULL, NULL, 0);
test_assert(tid = waitpid(tid, &status, __WALL));
test_assert(WIFSIGNALED(status));
return 0;
}

test_assert(child_tid > 0);
test_assert(child_tid == waitpid(child_tid, &status, __WALL));
test_assert(WIFSIGNALED(status) && WTERMSIG(status) == SIGSEGV);
test_assert(*(pid_t*)shared_page != (pid_t)-1 &&
*(pid_t*)shared_page == 0);
if (WIFSIGNALED(status) && WTERMSIG(status) == SIGSEGV) {
atomic_puts("Old (<5.16) kernel behavior");
test_assert(*(pid_t*)shared_page == 0);
} else if (WIFEXITED(status) && WEXITSTATUS(status) == 0) {
atomic_puts("New (5.16+) kernel behavior");
test_assert(*(pid_t*)shared_page == 0);
} else {
test_assert(0);
}

atomic_puts("EXIT-SUCCESS");
return 0;
Expand Down
44 changes: 44 additions & 0 deletions src/util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2357,5 +2357,49 @@ void SAFE_FATAL(int err, const char *msg)
abort();
}

static int child_SIGSEGV(__attribute__((unused)) void* arg) {
kill(getpid(), SIGSEGV);
return 0;
}

bool coredumping_signal_takes_down_entire_vm() {
// The kernel behavior here changed in 5.16. Prior to that,
// a coredumping signal would bring down the entire vm.
// Starting with 5.16 it only brings down the thread group.
// -1 here indicates uninitialized, 0 the new behavior, and
// 1 the old behavior.
static int coredumping_signal_vm_behavior = -1;
if (coredumping_signal_vm_behavior < 0) {
pid_t child = -1;
int status = -1;
LOG(debug) << "Testing coredumping behavior in the presence of CLONE_VM";
if ((child = fork()) == 0) {
// Remove any handlers and make sure we get the default signal behavior.
signal(SIGSEGV, SIG_DFL);
// Don't litter the system with core dumps.
prctl(PR_SET_DUMPABLE, 0);
// Allocate a stack for the child.
const size_t stack_size = 1 << 20;
void* stack = mmap(NULL, stack_size, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

pid_t tid = clone(child_SIGSEGV, (void*)((uintptr_t)stack + stack_size),
CLONE_VM, NULL, NULL, NULL, NULL);
DEBUG_ASSERT(tid > 0);

pid_t ret = waitpid(tid, &status, __WALL);
DEBUG_ASSERT(ret == tid);
DEBUG_ASSERT(WIFSIGNALED(status));
exit(0);
}

DEBUG_ASSERT(child > 0);
pid_t ret = waitpid(child, &status, __WALL);
DEBUG_ASSERT(ret == child);
coredumping_signal_vm_behavior = WIFSIGNALED(status);
}

return coredumping_signal_vm_behavior > 0;
}

} // namespace rr
2 changes: 2 additions & 0 deletions src/util.h
Original file line number Diff line number Diff line change
Expand Up @@ -558,6 +558,8 @@ int pop_count(uint64_t v);
safe to use in volatile contexts */
void SAFE_FATAL(int err, const char *msg);

bool coredumping_signal_takes_down_entire_vm();

} // namespace rr

#endif /* RR_UTIL_H_ */

0 comments on commit 1a3b389

Please sign in to comment.