RecordTask.cc
/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */
#include "RecordTask.h"
#include <dirent.h>
#include <elf.h>
#include <limits.h>
#include <linux/perf_event.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include "AutoRemoteSyscalls.h"
#include "PreserveFileMonitor.h"
#include "RecordSession.h"
#include "core.h"
#include "kernel_abi.h"
#include "kernel_metadata.h"
#include "log.h"
#include "record_signal.h"
#include "rr/rr.h"
#include "util.h"
using namespace std;
namespace rr {
/**
* Stores the table of signal dispositions and metadata for an
* arbitrary set of tasks. Each of those tasks must own one of
* the |refcount|s while they still refer to this.
*/
struct Sighandler {
Sighandler() : resethand(false), takes_siginfo(false) {}
template <typename Arch>
void init_arch(const typename Arch::kernel_sigaction& ksa) {
k_sa_handler = ksa.k_sa_handler;
sa.resize(sizeof(ksa));
memcpy(sa.data(), &ksa, sizeof(ksa));
resethand = (ksa.sa_flags & SA_RESETHAND) != 0;
takes_siginfo = (ksa.sa_flags & SA_SIGINFO) != 0;
}
template <typename Arch> void reset_arch() {
typename Arch::kernel_sigaction ksa;
memset(&ksa, 0, sizeof(ksa));
DEBUG_ASSERT(uintptr_t(SIG_DFL) == 0);
init_arch<Arch>(ksa);
}
SignalDisposition disposition() const {
DEBUG_ASSERT(uintptr_t(SIG_DFL) == 0);
DEBUG_ASSERT(uintptr_t(SIG_IGN) == 1);
switch (k_sa_handler.as_int()) {
case 0:
return SIGNAL_DEFAULT;
case 1:
return SIGNAL_IGNORE;
default:
return SIGNAL_HANDLER;
}
}
remote_code_ptr get_user_handler() const {
return disposition() == SIGNAL_HANDLER
? remote_code_ptr(k_sa_handler.as_int())
: remote_code_ptr();
}
remote_ptr<void> k_sa_handler;
// Saved kernel_sigaction; used to restore handler
vector<uint8_t> sa;
bool resethand;
bool takes_siginfo;
};
static void reset_handler(Sighandler* handler, SupportedArch arch) {
RR_ARCH_FUNCTION(handler->reset_arch, arch);
}
struct Sighandlers {
typedef shared_ptr<Sighandlers> shr_ptr;
shr_ptr clone() const {
shr_ptr s(new Sighandlers());
// NB: depends on the fact that Sighandler is for all
// intents and purposes a POD type, though not
// technically.
for (size_t i = 0; i < array_length(handlers); ++i) {
s->handlers[i] = handlers[i];
}
return s;
}
Sighandler& get(int sig) {
assert_valid(sig);
return handlers[sig];
}
const Sighandler& get(int sig) const {
assert_valid(sig);
return handlers[sig];
}
void init_from_current_process() {
for (size_t i = 1; i < array_length(handlers); ++i) {
Sighandler& h = handlers[i];
NativeArch::kernel_sigaction sa;
if (::syscall(SYS_rt_sigaction, i, nullptr, &sa, sizeof(uint64_t))) {
/* EINVAL means we're querying an
* unused signal number. */
DEBUG_ASSERT(EINVAL == errno);
continue;
}
msan_unpoison(&sa, sizeof(NativeArch::kernel_sigaction));
h.init_arch<NativeArch>(sa);
}
}
/**
* For each signal in |table| such that is_user_handler() is
* true, reset the disposition of that signal to SIG_DFL, and
* clear the resethand flag if it's set. SIG_IGN signals are
* not modified.
*
* (After an exec() call copies the original sighandler table,
* this is the operation required by POSIX to initialize that
* table copy.)
*/
void reset_user_handlers(SupportedArch arch) {
for (int i = 0; i < ssize_t(array_length(handlers)); ++i) {
Sighandler& h = handlers[i];
// If the handler was a user handler, reset to
// default. If it was SIG_IGN or SIG_DFL,
// leave it alone.
if (h.disposition() == SIGNAL_HANDLER) {
reset_handler(&h, arch);
}
}
}
void assert_valid(int sig) const {
DEBUG_ASSERT(0 < sig && sig < ssize_t(array_length(handlers)));
}
static shr_ptr create() { return shr_ptr(new Sighandlers()); }
Sighandler handlers[_NSIG];
private:
Sighandlers() {}
Sighandlers(const Sighandlers&);
Sighandlers operator=(const Sighandlers&);
};
RecordTask::RecordTask(RecordSession& session, pid_t _tid, uint32_t serial,
SupportedArch a)
: Task(session, _tid, _tid, serial, a),
ticks_at_last_recorded_syscall_exit(0),
time_at_start_of_last_timeslice(0),
priority(0),
in_round_robin_queue(false),
emulated_ptracer(nullptr),
emulated_ptrace_event_msg(0),
emulated_ptrace_options(0),
emulated_ptrace_cont_command(0),
emulated_stop_pending(false),
emulated_ptrace_SIGCHLD_pending(false),
emulated_SIGCHLD_pending(false),
emulated_ptrace_seized(false),
emulated_ptrace_queued_exit_stop(false),
in_wait_type(WAIT_TYPE_NONE),
in_wait_pid(0),
emulated_stop_type(NOT_STOPPED),
blocked_sigs_dirty(true),
syscallbuf_blocked_sigs_generation(0),
flushed_num_rec_bytes(0),
flushed_syscallbuf(false),
delay_syscallbuf_reset_for_desched(false),
delay_syscallbuf_reset_for_seccomp_trap(false),
prctl_seccomp_status(0),
robust_futex_list_len(0),
own_namespace_rec_tid(0),
exit_code(0),
termination_signal(0),
tsc_mode(PR_TSC_ENABLE),
cpuid_mode(1),
stashed_signals_blocking_more_signals(false),
stashed_group_stop(false),
break_at_syscallbuf_traced_syscalls(false),
break_at_syscallbuf_untraced_syscalls(false),
break_at_syscallbuf_final_instruction(false),
next_pmc_interrupt_is_for_user(false),
did_record_robust_futex_changes(false) {
push_event(Event::sentinel());
if (session.tasks().empty()) {
// Initial tracee. It inherited its state from this process, so set it up.
// The very first task we fork inherits the signal
// dispositions of the current OS process (which should all be
// default at this point, but ...). From there on, new tasks
// will transitively inherit from this first task.
auto sh = Sighandlers::create();
sh->init_from_current_process();
sighandlers.swap(sh);
own_namespace_rec_tid = _tid;
}
}
RecordTask::~RecordTask() {
if (emulated_ptracer) {
emulated_ptracer->emulated_ptrace_tracees.erase(this);
if (emulated_ptrace_options & PTRACE_O_TRACEEXIT) {
ASSERT(this, stable_exit)
<< "PTRACE_O_TRACEEXIT only supported for stable exits for now";
}
}
for (RecordTask* t : emulated_ptrace_tracees) {
// XXX emulate PTRACE_O_EXITKILL
ASSERT(this, t->emulated_ptracer == this);
t->emulated_ptracer = nullptr;
t->emulated_ptrace_options = 0;
t->emulated_stop_pending = false;
t->emulated_stop_type = NOT_STOPPED;
}
// Task::destroy has already done PTRACE_DETACH so the task can complete
// exiting.
// The kernel explicitly only clears the futex if the address space is shared.
// If the address space has no other users then the futex will not be cleared
// even if it lives in shared memory which other tasks can read.
// Unstable exits may result in the kernel *not* clearing the
// futex, for example for fatal signals. So we would
// deadlock waiting on the futex.
if (!unstable && !tid_futex.is_null() && as->task_set().size() > 1) {
// clone()'d tasks can have a pid_t* |ctid| argument
// that's written with the new task's pid. That
// pointer can also be used as a futex: when the task
// dies, the original ctid value is cleared and a
// FUTEX_WAKE is done on the address. So
// pthread_join() is basically a standard futex wait
// loop.
LOG(debug) << " waiting for tid futex " << tid_futex
<< " to be cleared ...";
bool ok = true;
futex_wait(tid_futex, 0, &ok);
if (ok) {
int val = 0;
record_local(tid_futex, &val);
}
}
// Write the exit event here so that the value recorded above is captured.
// Don't flush syscallbuf. Whatever triggered the exit (syscall, signal)
// should already have flushed it, if it was running. If it was blocked,
// then the syscallbuf would already have been flushed too. The exception
// is kill_all_tasks() in which case it's OK to just drop the last chunk of
// execution. Trying to flush syscallbuf for an exiting task could be bad,
// e.g. it could be in the middle of syscallbuf code that's supposed to be
// atomic. For the same reasons don't allow syscallbuf to be reset here.
record_event(Event::exit(), DONT_FLUSH_SYSCALLBUF, DONT_RESET_SYSCALLBUF);
// We expect tasks to usually exit by a call to exit() or
// exit_group(), so it's not helpful to warn about that.
if (EV_SENTINEL != ev().type() &&
(pending_events.size() > 2 ||
!(ev().type() == EV_SYSCALL &&
(is_exit_syscall(ev().Syscall().number, ev().Syscall().regs.arch()) ||
is_exit_group_syscall(ev().Syscall().number,
ev().Syscall().regs.arch()))))) {
LOG(warn) << tid << " still has pending events. From top down:";
log_pending_events();
}
}
void RecordTask::futex_wait(remote_ptr<int> futex, int val, bool* ok) {
// Wait for *sync_addr == sync_val. This implementation isn't
// pretty, but it's pretty much the best we can do with
// available kernel tools.
//
// TODO: find clever way to avoid busy-waiting.
while (true) {
int mem = read_mem(futex, ok);
if (!*ok || val == mem) {
// Invalid addresses are just ignored by the kernel
break;
}
// Try to give our scheduling slot to the kernel
// thread that's going to write sync_addr.
sched_yield();
}
}
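// Illustrative sketch (not rr code): the CLONE_CHILD_CLEARTID protocol that
// futex_wait() above is polling for. A joiner built directly on the raw
// syscalls would look roughly like this (hypothetical names; glibc clone()
// wrapper argument order):
//
//   pid_t child_tid = 0;
//   // CLONE_CHILD_SETTID writes the new tid into child_tid at creation;
//   // CLONE_CHILD_CLEARTID makes the kernel zero it and FUTEX_WAKE it when
//   // the child exits.
//   clone(child_fn, child_stack,
//         CLONE_VM | CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID | SIGCHLD,
//         nullptr /*arg*/, nullptr /*parent_tid*/, nullptr /*tls*/, &child_tid);
//   while (child_tid != 0) {
//     // Blocks while *(&child_tid) still equals the value we read.
//     syscall(SYS_futex, &child_tid, FUTEX_WAIT, child_tid, nullptr, nullptr, 0);
//   }
//
// rr can't block in a FUTEX_WAIT on the tracee's memory here, so futex_wait()
// just rereads the value and sched_yield()s until it observes the expected value.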
RecordSession& RecordTask::session() const {
return *Task::session().as_record();
}
TraceWriter& RecordTask::trace_writer() const {
return session().trace_writer();
}
Task* RecordTask::clone(CloneReason reason, int flags, remote_ptr<void> stack,
remote_ptr<void> tls, remote_ptr<int> cleartid_addr,
pid_t new_tid, pid_t new_rec_tid, uint32_t new_serial,
Session* other_session) {
ASSERT(this, reason == Task::TRACEE_CLONE);
Task* t = Task::clone(reason, flags, stack, tls, cleartid_addr, new_tid,
new_rec_tid, new_serial, other_session);
if (t->session().is_recording()) {
RecordTask* rt = static_cast<RecordTask*>(t);
if (CLONE_CLEARTID & flags) {
LOG(debug) << "cleartid futex is " << cleartid_addr;
ASSERT(this, !cleartid_addr.is_null());
rt->tid_futex = cleartid_addr;
} else {
LOG(debug) << "(clone child not enabling CLEARTID)";
}
}
return t;
}
void RecordTask::post_wait_clone(Task* cloned_from, int flags) {
ASSERT(cloned_from, cloned_from->session().is_recording());
Task::post_wait_clone(cloned_from, flags);
RecordTask* rt = static_cast<RecordTask*>(cloned_from);
priority = rt->priority;
syscallbuf_code_layout = rt->syscallbuf_code_layout;
prctl_seccomp_status = rt->prctl_seccomp_status;
robust_futex_list = rt->robust_futex_list;
robust_futex_list_len = rt->robust_futex_list_len;
tsc_mode = rt->tsc_mode;
cpuid_mode = rt->cpuid_mode;
if (CLONE_SHARE_SIGHANDLERS & flags) {
sighandlers = rt->sighandlers;
} else {
auto sh = rt->sighandlers->clone();
sighandlers.swap(sh);
}
update_own_namespace_tid();
}
static string exe_path(RecordTask* t) {
char proc_exe[PATH_MAX];
snprintf(proc_exe, sizeof(proc_exe), "/proc/%d/exe", t->tid);
char exe[PATH_MAX];
ssize_t ret = readlink(proc_exe, exe, sizeof(exe) - 1);
ASSERT(t, ret >= 0);
exe[ret] = 0;
return exe;
}
void RecordTask::post_exec() {
// Change syscall number to execve *for the new arch*. If we don't do this,
// and the arch changes, then the syscall number for execve in the old arch
// is treated as the syscall we're executing in the new arch, with hilarious
// results.
int syscallno = syscall_number_for_execve(arch());
registers.set_original_syscallno(syscallno);
// Fix event architecture and syscall number
ev().Syscall().number = syscallno;
ev().Syscall().set_arch(arch());
// The signal mask is inherited across execve so we don't need to invalidate.
string exe_file = exe_path(this);
Task::post_exec(exe_file);
if (emulated_ptracer) {
ASSERT(this, !(emulated_ptracer->arch() == x86 && arch() == x86_64))
<< "We don't support a 32-bit process tracing a 64-bit process";
}
// Clear robust_list state to match kernel state. If this task is cloned
// soon after exec, we must not do a bogus set_robust_list syscall for
// the clone.
set_robust_list(nullptr, 0);
sighandlers = sighandlers->clone();
sighandlers->reset_user_handlers(arch());
// Newly execed tasks always have non-faulting mode (from their point of
// view, even if rr is secretly causing faults).
cpuid_mode = 1;
}
template <typename Arch> static void do_preload_init_arch(RecordTask* t) {
auto params = t->read_mem(
remote_ptr<rrcall_init_preload_params<Arch>>(t->regs().arg1()));
t->syscallbuf_code_layout.syscallbuf_final_exit_instruction =
params.syscallbuf_final_exit_instruction.rptr().as_int();
t->syscallbuf_code_layout.syscallbuf_code_start =
params.syscallbuf_code_start.rptr().as_int();
t->syscallbuf_code_layout.syscallbuf_code_end =
params.syscallbuf_code_end.rptr().as_int();
t->syscallbuf_code_layout.get_pc_thunks_start =
params.get_pc_thunks_start.rptr().as_int();
t->syscallbuf_code_layout.get_pc_thunks_end =
params.get_pc_thunks_end.rptr().as_int();
unsigned char in_chaos = t->session().enable_chaos();
auto in_chaos_ptr = REMOTE_PTR_FIELD(params.globals.rptr(), in_chaos);
t->write_mem(in_chaos_ptr, in_chaos);
t->record_local(in_chaos_ptr, &in_chaos);
int cores = t->session().scheduler().pretend_num_cores();
auto cores_ptr = REMOTE_PTR_FIELD(params.globals.rptr(), pretend_num_cores);
t->write_mem(cores_ptr, cores);
t->record_local(cores_ptr, &cores);
uint64_t random_seed;
do {
random_seed = rand() | (uint64_t(rand()) << 32);
} while (!random_seed);
auto random_seed_ptr = REMOTE_PTR_FIELD(params.globals.rptr(), random_seed);
t->write_mem(random_seed_ptr, random_seed);
t->record_local(random_seed_ptr, &random_seed);
}
void RecordTask::push_syscall_event(int syscallno) {
push_event(SyscallEvent(syscallno, detect_syscall_arch()));
}
static void do_preload_init(RecordTask* t) {
RR_ARCH_FUNCTION(do_preload_init_arch, t->arch(), t);
}
void RecordTask::at_preload_init() {
Task::at_preload_init();
do_preload_init(this);
}
/**
* Avoid using low-numbered file descriptors since that can confuse
* developers.
*/
static int find_free_file_descriptor(pid_t for_tid) {
int fd = 300 + (for_tid % 500);
while (true) {
char buf[PATH_MAX];
sprintf(buf, "/proc/%d/fd/%d", for_tid, fd);
if (access(buf, F_OK) == -1 && errno == ENOENT) {
return fd;
}
++fd;
}
}
template <typename Arch> void RecordTask::init_buffers_arch() {
// NB: the tracee can't be interrupted with a signal while
// we're processing the rrcall, because it's masked off all
// signals.
AutoRemoteSyscalls remote(this);
// Arguments to the rrcall.
remote_ptr<rrcall_init_buffers_params<Arch>> child_args = regs().arg1();
auto args = read_mem(child_args);
args.cloned_file_data_fd = -1;
if (as->syscallbuf_enabled()) {
args.syscallbuf_size = syscallbuf_size = session().syscall_buffer_size();
KernelMapping syscallbuf_km = init_syscall_buffer(remote, nullptr);
args.syscallbuf_ptr = syscallbuf_child;
desched_fd_child = args.desched_counter_fd;
// Prevent the child from closing this fd
fds->add_monitor(desched_fd_child, new PreserveFileMonitor());
desched_fd = remote.retrieve_fd(desched_fd_child);
auto record_in_trace = trace_writer().write_mapped_region(
this, syscallbuf_km, syscallbuf_km.fake_stat(),
vector<TraceRemoteFd>(),
TraceWriter::RR_BUFFER_MAPPING);
ASSERT(this, record_in_trace == TraceWriter::DONT_RECORD_IN_TRACE);
if (trace_writer().supports_file_data_cloning() &&
session().use_read_cloning()) {
string clone_file_name = trace_writer().file_data_clone_file_name(tuid());
AutoRestoreMem name(remote, clone_file_name.c_str());
int cloned_file_data = remote.syscall(syscall_number_for_openat(arch()),
RR_RESERVED_ROOT_DIR_FD, name.get(),
O_RDWR | O_CREAT | O_CLOEXEC, 0600);
if (cloned_file_data >= 0) {
int free_fd = find_free_file_descriptor(tid);
cloned_file_data_fd_child =
remote.syscall(syscall_number_for_dup3(arch()), cloned_file_data,
free_fd, O_CLOEXEC);
if (cloned_file_data_fd_child != free_fd) {
ASSERT(this, cloned_file_data_fd_child < 0);
LOG(warn) << "Couldn't dup clone-data file to free fd";
cloned_file_data_fd_child = cloned_file_data;
} else {
// Prevent the child from closing this fd. We're going to close it
// ourselves and we don't want the child closing it and then reopening
// its own file with this fd.
fds->add_monitor(cloned_file_data_fd_child,
new PreserveFileMonitor());
remote.infallible_syscall(syscall_number_for_close(arch()),
cloned_file_data);
}
args.cloned_file_data_fd = cloned_file_data_fd_child;
}
}
} else {
args.syscallbuf_ptr = remote_ptr<void>(nullptr);
args.syscallbuf_size = 0;
}
args.scratch_buf = scratch_ptr;
args.usable_scratch_size = usable_scratch_size();
// Return the mapped buffers to the child.
write_mem(child_args, args);
// The tracee doesn't need this addr returned, because it's
// already written to the inout |args| param, but we stash it
// away in the return value slot so that we can easily check
// that we map the segment at the same addr during replay.
remote.regs().set_syscall_result(syscallbuf_child);
}
void RecordTask::init_buffers() { RR_ARCH_FUNCTION(init_buffers_arch, arch()); }
template <typename Arch>
void RecordTask::on_syscall_exit_arch(int syscallno, const Registers& regs) {
if (regs.original_syscallno() == SECCOMP_MAGIC_SKIP_ORIGINAL_SYSCALLNO ||
regs.syscall_failed()) {
return;
}
switch (syscallno) {
case Arch::set_robust_list:
set_robust_list(regs.arg1(), (size_t)regs.arg2());
return;
case Arch::sigaction:
case Arch::rt_sigaction:
// TODO: SYS_signal
update_sigaction(regs);
return;
case Arch::set_tid_address:
set_tid_addr(regs.arg1());
return;
case Arch::sigsuspend:
case Arch::rt_sigsuspend:
case Arch::sigprocmask:
case Arch::rt_sigprocmask:
case Arch::pselect6:
case Arch::ppoll:
invalidate_sigmask();
return;
}
}
void RecordTask::on_syscall_exit(int syscallno, SupportedArch arch,
const Registers& regs) {
with_converted_registers<void>(regs, arch, [&](const Registers& regs) {
Task::on_syscall_exit(syscallno, arch, regs);
RR_ARCH_FUNCTION(on_syscall_exit_arch, arch, syscallno, regs)
});
}
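// True if the instruction pointer is just past a breakpoint instruction placed
// at one of the syscallbuf syscall entry points (see
// syscallbuf_syscall_entry_breakpoints() below).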
bool RecordTask::is_at_syscallbuf_syscall_entry_breakpoint() {
auto i = ip().decrement_by_bkpt_insn_length(arch());
for (auto p : syscallbuf_syscall_entry_breakpoints()) {
if (i == p) {
return true;
}
}
return false;
}
bool RecordTask::is_at_syscallbuf_final_instruction_breakpoint() {
if (!break_at_syscallbuf_final_instruction) {
return false;
}
auto i = ip().decrement_by_bkpt_insn_length(arch());
return i == syscallbuf_code_layout.syscallbuf_final_exit_instruction;
}
void RecordTask::will_resume_execution(ResumeRequest, WaitRequest,
TicksRequest ticks_request, int sig) {
// We may execute user code, which could lead to an RDTSC or grow-map
// operation which unblocks SIGSEGV, and we'll need to know whether to
// re-block it. So we need our cached sigmask to be up to date.
// We don't need to do this if we're not going to execute user code
// (i.e. ticks_request == RESUME_NO_TICKS) except that did_wait can't
// easily check for that and may restore blocked_sigs so it had better be
// accurate.
get_sigmask();
if (stashed_signals_blocking_more_signals) {
// A stashed signal we have already accepted for this task may
// have a sigaction::sa_mask that would block the next signal to be
// delivered and cause it to be delivered to a different task. If we allow
// such a signal to be delivered to this task then we run the risk of never
// being able to process the signal (if it stays blocked indefinitely).
// To prevent this, block any further signal delivery as long as there are
// stashed signals.
// We assume the kernel can't report a new signal of the same number
// in response to us injecting a signal. XXX is this true??? We don't
// have much choice, signal injection won't work if we block the signal.
// We leave rr signals unblocked. TIME_SLICE_SIGNAL has to be unblocked
// because blocking it seems to cause problems for some hardware/kernel
// configurations (see https://github.com/mozilla/rr/issues/1979),
// causing them to stop counting events.
sig_set_t sigset = ~(signal_bit(SYSCALLBUF_DESCHED_SIGNAL) |
signal_bit(PerfCounters::TIME_SLICE_SIGNAL));
if (sig) {
// We're injecting a signal, so make sure that signal is unblocked.
sigset &= ~signal_bit(sig);
}
int ret = fallible_ptrace(PTRACE_SETSIGMASK, remote_ptr<void>(8), &sigset);
if (ret < 0) {
if (errno == EIO) {
FATAL() << "PTRACE_SETSIGMASK not supported; rr requires Linux kernel >= 3.11";
}
ASSERT(this, errno == EINVAL);
} else {
LOG(debug) << "Set signal mask to block all signals (bar "
<< "SYSCALLBUF_DESCHED_SIGNAL/TIME_SLICE_SIGNAL) while we "
<< " have a stashed signal";
}
}
// RESUME_NO_TICKS means that tracee code is not going to run so there's no
// need to set breakpoints and in fact they might interfere with rr
// processing.
if (ticks_request != RESUME_NO_TICKS) {
if (!at_may_restart_syscall()) {
// If the tracee has SIGTRAP blocked or ignored and we hit one of these
// breakpoints, the kernel will automatically unblock the signal and set
// its disposition to DFL, effects which we ought to undo to keep these
// SIGTRAPs invisible to tracees. Fixing the sigmask happens
// automatically in did_wait(). Restoring the signal-ignored status is
// handled in `handle_syscallbuf_breakpoint`.
// Set breakpoints at untraced syscalls to catch us entering an untraced
// syscall. We don't need to do this (and shouldn't do this) if the
// execution requestor wants to stop inside untraced syscalls.
// If we have an interrupted syscall that we may restart, don't
// set the breakpoints because we should restart the syscall instead
// of breaking and delivering signals. The syscallbuf code doesn't
// (and must not) perform more than one blocking syscall for any given
// buffered syscall.
for (auto p : syscallbuf_syscall_entry_breakpoints()) {
vm()->add_breakpoint(p, BKPT_INTERNAL);
}
}
if (break_at_syscallbuf_final_instruction) {
vm()->add_breakpoint(
syscallbuf_code_layout.syscallbuf_final_exit_instruction,
BKPT_INTERNAL);
}
}
}
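// Returns the rr page syscall entry points that should carry internal
// breakpoints, based on the break_at_syscallbuf_untraced_syscalls and
// break_at_syscallbuf_traced_syscalls flags.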
vector<remote_code_ptr> RecordTask::syscallbuf_syscall_entry_breakpoints() {
vector<remote_code_ptr> result;
if (break_at_syscallbuf_untraced_syscalls) {
result.push_back(AddressSpace::rr_page_syscall_entry_point(
AddressSpace::UNTRACED, AddressSpace::UNPRIVILEGED,
AddressSpace::RECORDING_ONLY, arch()));
result.push_back(AddressSpace::rr_page_syscall_entry_point(
AddressSpace::UNTRACED, AddressSpace::UNPRIVILEGED,
AddressSpace::RECORDING_AND_REPLAY, arch()));
}
if (break_at_syscallbuf_traced_syscalls) {
result.push_back(AddressSpace::rr_page_syscall_entry_point(
AddressSpace::TRACED, AddressSpace::UNPRIVILEGED,
AddressSpace::RECORDING_AND_REPLAY, arch()));
}
return result;
}
void RecordTask::did_wait() {
for (auto p : syscallbuf_syscall_entry_breakpoints()) {
vm()->remove_breakpoint(p, BKPT_INTERNAL);
}
if (break_at_syscallbuf_final_instruction) {
vm()->remove_breakpoint(
syscallbuf_code_layout.syscallbuf_final_exit_instruction,
BKPT_INTERNAL);
}
if (stashed_signals_blocking_more_signals) {
// Saved 'blocked_sigs' must still be correct regardless of syscallbuf
// state, because we do not allow stashed_signals_blocking_more_signals to
// hold across syscalls (traced or untraced) that change the signal mask.
ASSERT(this, !blocked_sigs_dirty);
xptrace(PTRACE_SETSIGMASK, remote_ptr<void>(8), &blocked_sigs);
} else if (syscallbuf_child) {
// The syscallbuf struct is only 32 bytes currently so read the whole thing
// at once to avoid multiple calls to read_mem. Even though this shouldn't
// need a syscall because we use a local-mapping, apparently that lookup
// is still noticeably expensive.
auto syscallbuf = read_mem(syscallbuf_child);
if (syscallbuf.in_sigprocmask_critical_section) {
// |blocked_sigs| may have been updated but the syscall not yet issued.
// Use the kernel's value.
invalidate_sigmask();
} else {
uint32_t syscallbuf_generation = syscallbuf.blocked_sigs_generation;
if (syscallbuf_generation > syscallbuf_blocked_sigs_generation) {
syscallbuf_blocked_sigs_generation = syscallbuf_generation;
blocked_sigs = syscallbuf.blocked_sigs;
}
}
}
}
void RecordTask::set_emulated_ptracer(RecordTask* tracer) {
if (tracer) {
ASSERT(this, !emulated_ptracer);
emulated_ptracer = tracer;
emulated_ptracer->emulated_ptrace_tracees.insert(this);
} else {
ASSERT(this, emulated_ptracer);
ASSERT(this,
emulated_stop_type == NOT_STOPPED ||
emulated_stop_type == GROUP_STOP);
emulated_ptracer->emulated_ptrace_tracees.erase(this);
emulated_ptracer = nullptr;
}
}
bool RecordTask::emulate_ptrace_stop(WaitStatus status,
const siginfo_t* siginfo, int si_code) {
ASSERT(this, emulated_stop_type == NOT_STOPPED);
if (!emulated_ptracer) {
return false;
}
if (siginfo) {
ASSERT(this, status.ptrace_signal() == siginfo->si_signo);
save_ptrace_signal_siginfo(*siginfo);
} else {
siginfo_t si;
memset(&si, 0, sizeof(si));
si.si_signo = status.ptrace_signal();
if (status.ptrace_event() || status.is_syscall()) {
si.si_code = status.get() >> 8;
} else {
si.si_code = si_code;
}
save_ptrace_signal_siginfo(si);
}
force_emulate_ptrace_stop(status);
return true;
}
void RecordTask::force_emulate_ptrace_stop(WaitStatus status) {
emulated_stop_type = status.group_stop() ? GROUP_STOP : SIGNAL_DELIVERY_STOP;
emulated_stop_code = status;
emulated_stop_pending = true;
emulated_ptrace_SIGCHLD_pending = true;
emulated_ptracer->send_synthetic_SIGCHLD_if_necessary();
// The SIGCHLD will eventually be reported to rr via a ptrace stop,
// interrupting wake_task's syscall (probably a waitpid) if necessary. At
// that point, we'll fix up the siginfo data with values that match what
// the kernel would have delivered for a real ptracer's SIGCHLD. When the
// signal handler (if any) returns, if wake_task was in a blocking wait that
// wait will be resumed, at which point rec_prepare_syscall_arch will
// discover the pending ptrace result and emulate the wait syscall to
// return that result immediately.
}
void RecordTask::send_synthetic_SIGCHLD_if_necessary() {
RecordTask* wake_task = nullptr;
bool need_signal = false;
for (RecordTask* tracee : emulated_ptrace_tracees) {
if (tracee->emulated_ptrace_SIGCHLD_pending) {
need_signal = true;
// check to see if any thread in the ptracer process is in a waitpid that
// could read the status of 'tracee'. If it is, we should wake up that
// thread. Otherwise we send SIGCHLD to the ptracer thread.
for (Task* t : thread_group()->task_set()) {
auto rt = static_cast<RecordTask*>(t);
if (rt->is_waiting_for_ptrace(tracee)) {
wake_task = rt;
break;
}
}
if (wake_task) {
break;
}
}
}
if (!need_signal) {
for (ThreadGroup* child_tg : thread_group()->children()) {
for (Task* child : child_tg->task_set()) {
RecordTask* rchild = static_cast<RecordTask*>(child);
if (rchild->emulated_SIGCHLD_pending) {
need_signal = true;
// check to see if any thread in the ptracer process is in a waitpid that
// could read the status of 'tracee'. If it is, we should wake up that
// thread. Otherwise we send SIGCHLD to the ptracer thread.
for (Task* t : thread_group()->task_set()) {
auto rt = static_cast<RecordTask*>(t);
if (rt->is_waiting_for(rchild)) {
wake_task = rt;
break;
}
}
if (wake_task) {
break;
}
}
}
}
if (!need_signal) {
return;
}
}
// ptrace events trigger SIGCHLD in the ptracer's wake_task.
// We can't set all the siginfo values to their correct values here, so
// we'll patch this up when the signal is received.
// If there's already a pending SIGCHLD, this signal will be ignored,
// but at some point the pending SIGCHLD will be delivered and then
// send_synthetic_SIGCHLD_if_necessary will be called again to deliver a new
// SIGCHLD if necessary.
siginfo_t si;
memset(&si, 0, sizeof(si));
si.si_code = SI_QUEUE;
si.si_value.sival_int = SIGCHLD_SYNTHETIC;
int ret;
if (wake_task) {
LOG(debug) << "Sending synthetic SIGCHLD to tid " << wake_task->tid;
// We must use the raw SYS_rt_tgsigqueueinfo syscall here to ensure the
// signal is sent to the correct thread by tid.
ret = syscall(SYS_rt_tgsigqueueinfo, wake_task->tgid(), wake_task->tid,
SIGCHLD, &si);
ASSERT(this, ret == 0);
if (wake_task->is_sig_blocked(SIGCHLD)) {
// Just sending SIGCHLD won't wake it up. Send it a TIME_SLICE_SIGNAL
// as well to make sure it exits a blocking syscall. We ensure those
// can never be blocked.
// We have to send a negative code here because only the kernel can set
// positive codes. We set a magic number so we can recognize it
// when received.
si.si_code = SYNTHETIC_TIME_SLICE_SI_CODE;
ret = syscall(SYS_rt_tgsigqueueinfo, wake_task->tgid(), wake_task->tid,
PerfCounters::TIME_SLICE_SIGNAL, &si);
ASSERT(this, ret == 0);
}
} else {
// Send the signal to the process as a whole and let the kernel
// decide which thread gets it.
ret = syscall(SYS_rt_sigqueueinfo, tgid(), SIGCHLD, &si);
ASSERT(this, ret == 0);
LOG(debug) << "Sending synthetic SIGCHLD to pid " << tgid();
}
}
static bool is_synthetic_SIGCHLD(const siginfo_t& si) {
return si.si_signo == SIGCHLD && si.si_value.sival_int == SIGCHLD_SYNTHETIC;
}
bool RecordTask::set_siginfo_for_synthetic_SIGCHLD(siginfo_t* si) {
if (!is_synthetic_SIGCHLD(*si)) {
return true;
}
if (is_syscall_restart()) {
// ptrace generated signals don't interrupt syscalls such as wait.
// Return false to tell the caller to defer the signal and resume
// the syscall.
return false;
}
for (RecordTask* tracee : emulated_ptrace_tracees) {
if (tracee->emulated_ptrace_SIGCHLD_pending) {
tracee->emulated_ptrace_SIGCHLD_pending = false;
tracee->set_siginfo_for_waited_task<NativeArch>(
reinterpret_cast<NativeArch::siginfo_t*>(si));
si->si_value.sival_int = 0;
return true;
}
}
for (ThreadGroup* child_tg : thread_group()->children()) {
for (Task* child : child_tg->task_set()) {
auto rchild = static_cast<RecordTask*>(child);
if (rchild->emulated_SIGCHLD_pending) {
rchild->emulated_SIGCHLD_pending = false;
rchild->set_siginfo_for_waited_task<NativeArch>(
reinterpret_cast<NativeArch::siginfo_t*>(si));
si->si_value.sival_int = 0;
return true;
}
}
}
return true;
}
bool RecordTask::is_waiting_for_ptrace(RecordTask* t) {
// This task's process must be a ptracer of t.
if (!t->emulated_ptracer ||
t->emulated_ptracer->thread_group() != thread_group()) {
return false;
}
// XXX need to check |options| to make sure this task is eligible!!
switch (in_wait_type) {
case WAIT_TYPE_NONE:
return false;
case WAIT_TYPE_ANY:
return true;
case WAIT_TYPE_SAME_PGID:
return getpgid(t->tgid()) == getpgid(tgid());
case WAIT_TYPE_PGID:
return getpgid(t->tgid()) == in_wait_pid;
case WAIT_TYPE_PID:
// When waiting for a ptracee, a specific pid is interpreted as the
// exact tid.
return t->tid == in_wait_pid;
default:
ASSERT(this, false);
return false;
}
}
bool RecordTask::is_waiting_for(RecordTask* t) {
// t must be a child of this task.
if (t->thread_group()->parent() != thread_group().get()) {
return false;
}
switch (in_wait_type) {
case WAIT_TYPE_NONE:
return false;
case WAIT_TYPE_ANY:
return true;
case WAIT_TYPE_SAME_PGID:
return getpgid(t->tgid()) == getpgid(tgid());
case WAIT_TYPE_PGID:
return getpgid(t->tgid()) == in_wait_pid;
case WAIT_TYPE_PID:
return t->tgid() == in_wait_pid;
default:
ASSERT(this, false);
return false;
}
}
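// Save the siginfo for an emulated ptrace stop, keeping at most one saved
// entry per signal number: a newer siginfo replaces any older one for the
// same signal.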
void RecordTask::save_ptrace_signal_siginfo(const siginfo_t& si) {
for (auto it = saved_ptrace_siginfos.begin();
it != saved_ptrace_siginfos.end(); ++it) {
if (it->si_signo == si.si_signo) {
saved_ptrace_siginfos.erase(it);
break;
}
}
saved_ptrace_siginfos.push_back(si);
}
siginfo_t& RecordTask::get_saved_ptrace_siginfo() {
int sig = emulated_stop_code.ptrace_signal();
ASSERT(this, sig > 0);
for (auto it = saved_ptrace_siginfos.begin();
it != saved_ptrace_siginfos.end(); ++it) {
if (it->si_signo == sig) {
return *it;
}
}
ASSERT(this, false) << "No saved siginfo found for stop-signal???";
while (true) {
// Avoid having to return anything along this (unreachable) path
}
}
siginfo_t RecordTask::take_ptrace_signal_siginfo(int sig) {
for (auto it = saved_ptrace_siginfos.begin();
it != saved_ptrace_siginfos.end(); ++it) {
if (it->si_signo == sig) {
siginfo_t si = *it;
saved_ptrace_siginfos.erase(it);
return si;
}
}
siginfo_t si;
memset(&si, 0, sizeof(si));
si.si_signo = sig;
return si;
}
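// Read the parent pid of |pid| from /proc/<pid>/status; returns -1 if it
// can't be determined.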
static pid_t get_ppid(pid_t pid) {
auto ppid_str = read_proc_status_fields(pid, "PPid");
if (ppid_str.empty()) {
return -1;
}
char* end;
int actual_ppid = strtol(ppid_str[0].c_str(), &end, 10);
return *end ? -1 : actual_ppid;
}
void RecordTask::apply_group_stop(int sig) {
if (emulated_stop_type == NOT_STOPPED) {
LOG(debug) << "setting " << tid << " to GROUP_STOP due to signal " << sig;
WaitStatus status = WaitStatus::for_group_sig(sig, this);
if (!emulate_ptrace_stop(status)) {
emulated_stop_type = GROUP_STOP;
emulated_stop_code = status;