diff --git a/CMakeLists.txt b/CMakeLists.txt
index bfca1c16f67..4241524704c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -681,6 +681,7 @@ set(BASIC_TESTS
   scm_rights
   seccomp
   seccomp_null
+  seccomp_sigsys_args
   seccomp_sigsys_sigtrap
   seccomp_tsync
   seccomp_veto_exec
diff --git a/src/RecordSession.cc b/src/RecordSession.cc
index 47968439a39..bb488b29c88 100644
--- a/src/RecordSession.cc
+++ b/src/RecordSession.cc
@@ -385,13 +385,17 @@ static void seccomp_trap_done(RecordTask* t) {
 static void handle_seccomp_trap(RecordTask* t,
                                 RecordSession::StepState* step_state,
                                 uint16_t seccomp_data) {
-  int syscallno = t->regs().original_syscallno();
-
   // The architecture may be wrong, but that's ok, because an actual syscall
   // entry did happen, so the registers are already updated according to the
   // architecture of the system call.
   t->canonicalize_and_set_regs(t->regs(), t->detect_syscall_arch());
 
+  Registers r = t->regs();
+  int syscallno = r.original_syscallno();
+  // Cause kernel processing to skip the syscall
+  r.set_original_syscallno(SECCOMP_MAGIC_SKIP_ORIGINAL_SYSCALLNO);
+  t->set_regs(r);
+
   if (t->is_in_untraced_syscall()) {
     ASSERT(t, !t->delay_syscallbuf_reset);
     // Don't reset the syscallbuf immediately after delivering the trap. We have
@@ -415,8 +419,6 @@ static void handle_seccomp_trap(RecordTask* t,
     t->record_current_event();
   }
 
-  Registers r = t->regs();
-
   // Use NativeArch here because different versions of system headers
   // have inconsistent field naming.
   union {
@@ -449,8 +451,6 @@ static void handle_seccomp_trap(RecordTask* t,
   // Tests show that the current registers are preserved (on x86, eax/rax
   // retains the syscall number).
   r.set_syscallno(syscallno);
-  // Cause kernel processing to skip the syscall
-  r.set_original_syscallno(SECCOMP_MAGIC_SKIP_ORIGINAL_SYSCALLNO);
   t->set_regs(r);
 
   if (t->is_in_untraced_syscall()) {
@@ -475,18 +475,19 @@ static void handle_seccomp_trap(RecordTask* t,
 static void handle_seccomp_errno(RecordTask* t,
                                  RecordSession::StepState* step_state,
                                  uint16_t seccomp_data) {
-  int syscallno = t->regs().original_syscallno();
-
   t->canonicalize_and_set_regs(t->regs(), t->detect_syscall_arch());
 
+  Registers r = t->regs();
+  int syscallno = r.original_syscallno();
+  // Cause kernel processing to skip the syscall
+  r.set_original_syscallno(SECCOMP_MAGIC_SKIP_ORIGINAL_SYSCALLNO);
+  t->set_regs(r);
+
   if (!t->is_in_untraced_syscall()) {
     t->push_syscall_event(syscallno);
     note_entering_syscall(t);
   }
 
-  Registers r = t->regs();
-  // Cause kernel processing to skip the syscall
-  r.set_original_syscallno(SECCOMP_MAGIC_SKIP_ORIGINAL_SYSCALLNO);
   r.set_syscall_result(-seccomp_data);
   t->set_regs(r);
   // Don't continue yet. At the next iteration of record_step, if we
diff --git a/src/record_syscall.cc b/src/record_syscall.cc
index 5c1d935f89b..cb7b7bb0f2c 100644
--- a/src/record_syscall.cc
+++ b/src/record_syscall.cc
@@ -2740,6 +2740,11 @@ static Switchable rec_prepare_syscall_arch(RecordTask* t,
                                            const Registers& regs) {
   int syscallno = t->ev().Syscall().number;
 
+  if (t->regs().original_syscallno() == SECCOMP_MAGIC_SKIP_ORIGINAL_SYSCALLNO) {
+    // rr vetoed this syscall. Don't do any pre-processing.
+    return PREVENT_SWITCH;
+  }
+
   syscall_state.syscall_entry_registers = regs;
 
   if (t->desched_rec()) {
diff --git a/src/test/seccomp_sigsys_args.c b/src/test/seccomp_sigsys_args.c
new file mode 100644
index 00000000000..cde9c2ad5e6
--- /dev/null
+++ b/src/test/seccomp_sigsys_args.c
@@ -0,0 +1,71 @@
+/* -*- Mode: C; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */
+
+#include "rrutil.h"
+
+static void handler(int sig, __attribute__((unused)) siginfo_t* si, void* p) {
+  ucontext_t* ctx = p;
+  test_assert(sig == SIGSYS);
+#ifdef __i386__
+  test_assert(ctx->uc_mcontext.gregs[REG_EBX] == 0);
+  ctx->uc_mcontext.gregs[REG_EAX] = 42;
+#elif defined(__x86_64__)
+  test_assert(ctx->uc_mcontext.gregs[REG_RDI] == 0);
+  ctx->uc_mcontext.gregs[REG_RAX] = 42;
+#else
+#error define architecture here
+#endif
+}
+
+static void install_filter(void) {
+  struct sock_filter filter[] = {
+    /* Load system call number from 'seccomp_data' buffer into
+       accumulator */
+    BPF_STMT(BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, nr)),
+    /* Jump forward 1 instruction if system call number
+       is not SYS_sched_setaffinity */
+    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, SYS_sched_setaffinity, 0, 1),
+    /* Trigger SIGSYS */
+    BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRAP),
+    /* Destination of system call number mismatch: allow other
+       system calls */
+    BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW)
+  };
+  struct sock_fprog prog = {
+    .len = (unsigned short)(sizeof(filter) / sizeof(filter[0])),
+    .filter = filter,
+  };
+  int ret;
+
+  ret = syscall(RR_seccomp, SECCOMP_SET_MODE_FILTER, 0, &prog);
+  if (ret == -1 && errno == ENOSYS) {
+    ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
+  }
+  test_assert(ret == 0);
+}
+
+int main(void) {
+  struct sigaction sa;
+  sigset_t sigs;
+
+  test_assert(open("/dev/null", O_RDONLY) >= 0);
+
+  sigemptyset(&sigs);
+  sigaddset(&sigs, SIGTRAP);
+  sigprocmask(SIG_SETMASK, &sigs, NULL);
+
+  sa.sa_sigaction = handler;
+  sigemptyset(&sa.sa_mask);
+  sa.sa_flags = SA_SIGINFO;
+  sigaction(SIGSYS, &sa, NULL);
+
+  test_assert(0 == prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0));
+  test_assert(1 == prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0));
+  install_filter();
+
+  /* Test SIGSYS for a syscall rr mucks with */
+  test_assert(sched_setaffinity(0, 0, NULL) == 42);
+
+  atomic_puts("EXIT-SUCCESS");
+
+  return 0;
+}