/
preload.c
1842 lines (1617 loc) · 53.5 KB
/
preload.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/* -*- Mode: C; tab-width: 8; c-basic-offset: 8; indent-tabs-mode: t; -*- */
//#define DEBUGTAG "rrpreload"
#include "syscall_buffer.h"
/**
* Buffer syscalls, so that rr can process the entire buffer with one
* trap instead of a trap per call.
*
* This file is compiled into a dso that's PRELOADed in recorded
* applications. The dso replaces libc syscall wrappers with our own
* implementation that saves nondetermistic outparams in a fixed-size
* buffer. When the buffer is full or the recorded application
* invokes an un-buffered syscall or receives a signal, we trap to rr
* and it records the state of the buffer.
*
* During replay, rr simply refills the buffer with the recorded data
* when it reaches the "flush-buffer" events that were recorded. Then
* rr emulates each buffered syscall, and the code here restores the
* client data from the refilled buffer.
*
* The crux of the implementation here is to selectively ptrace-trap
* syscalls. The normal (un-buffered) syscalls generate a ptrace
* trap, and the buffered syscalls trap directly to the kernel. This
* is implemented with a seccomp-bpf which examines the syscall and
* decides how to handle it (see seccomp-bpf.h).
*
* Because this code runs in the tracee's address space and overrides
* system calls, the code is rather delicate. The following rules
* must be followed
*
* o No rr headers (other than seccomp-bpf.h and rr.h) may be included
* o All syscalls invoked by this code must be called directly, not
* through libc wrappers (which this file may itself indirectly override)
*/
/**
* We also use this preload library to disable XShm by overriding
* XShmQueryExtension.
*/
#include <fcntl.h>
#include <limits.h>
#include <link.h>
#include <linux/futex.h>
#include <linux/net.h>
#include <linux/perf_event.h>
#include <poll.h>
#include <pthread.h>
#include <signal.h>
#include <stdarg.h>
#include <stdlib.h>
#include <stdio.h>
#include <syscall.h>
#include <sys/epoll.h>
#include <sysexits.h>
#include <sys/file.h>
#include <sys/mman.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <time.h>
#include <unistd.h>
/* NB: don't include any other local headers here. */
#include "rr/rr.h"
#include "seccomp-bpf.h"
#ifdef memcpy
# undef memcpy
#endif
#define memcpy you_must_use_local_memcpy
#ifdef syscall
# undef syscall
#endif
#define syscall you_must_use_traced_syscall
/**
 * Copy |_rhs| to |_lhs|. If the copy overflows, set errno to
 * EOVERFLOW and return -1.
 *
 * Overflow is detected by round-tripping: after the (possibly
 * narrowing) assignment, if |_lhs| no longer compares equal to
 * |_rhs|, bits were lost.  The sizeof inequality skips the compare
 * when the types are the same width (no truncation possible).
 *
 * WARNING: this macro affects control flow, use with great care.
 */
#define COPY_CHECK_OVERFLOW(_lhs, _rhs)					\
	do {								\
		_lhs = _rhs;						\
		if (sizeof(_lhs) != sizeof(_rhs) && (_lhs) != (_rhs)) {	\
			errno = EOVERFLOW;				\
			return -1;					\
		}							\
	} while(0)
/**
 * Represents syscall params. Makes it simpler to pass them around,
 * and avoids pushing/popping all the data for calls.
 */
struct syscall_info {
	long no;	/* syscall number */
	long args[6];	/* raw syscall args, in kernel ABI order */
};
/* Nonzero when syscall buffering is enabled. */
static int buffer_enabled;
/* Nonzero after process-global state like the seccomp-bpf has been
* initialized. */
static int process_inited;
/* Nonzero when thread-local state like the syscallbuf has been
* initialized. */
static __thread int thread_inited;
/* When buffering is enabled, points at the thread's mapped buffer
* segment. At the start of the segment is an object of type |struct
* syscallbuf_hdr|, so |buffer| is also a pointer to the buffer
* header. */
static __thread byte* buffer;
/* This is used to support the buffering of "may-block" system calls.
* The problem that needs to be addressed can be introduced with a
* simple example; assume that we're buffering the "read" and "write"
* syscalls.
*
* o (Tasks W and R set up a synchronous-IO pipe open between them; W
* "owns" the write end of the pipe; R owns the read end; the pipe
* buffer is full)
* o Task W invokes the write syscall on the pipe
* o Since write is a buffered syscall, the seccomp filter traps W
* directly to the kernel; there's no trace event for W delivered
* to rr.
* o The pipe is full, so W is descheduled by the kernel because W
* can't make progress.
* o rr thinks W is still running and doesn't schedule R.
*
* At this point, progress in the recorded application can only be
* made by scheduling R, but no one tells rr to do that. Oops!
*
* Thus enter the "desched counter". It's a perf_event for the "sw t
* switches" event (which, more precisely, is "sw deschedule"; it
* counts schedule-out, not schedule-in). We program the counter to
* deliver a signal to this task when there's new counter data
* available. And we set up the "sample period", how many descheds
* are triggered before the signal is delivered, to be "1". This
* means that when the counter is armed, the next desched (i.e., the
* next time the desched counter is bumped up) of this task will
* deliver the signal to it. And signal delivery always generates a
* ptrace trap, so rr can deduce that this task was descheduled and
* schedule another.
*
* The description above is sort of an idealized view; there are
* numerous implementation details that are documented in
* handle_signal.c, where they're dealt with. */
static __thread int desched_counter_fd;
/* Points at the libc/pthread pthread_create(). We wrap
* pthread_create, so need to retain this pointer to call out to the
* libc version. */
static int (*real_pthread_create)(pthread_t* thread,
const pthread_attr_t* attr,
void* (*start_routine) (void*), void* arg);
/* Points at the libc/pthread pthread_mutex_lock(). We wrap
* pthread_mutex_lock, so need to retain this pointer to call out to the
* libc version. */
static int (*real_pthread_mutex_lock)(pthread_mutex_t* mutex);
/* Points at the libc/pthread pthread_mutex_timedlock(). We wrap
* pthread_mutex_timedlock, so need to retain this pointer to call out to the
* libc version. */
static int (*real_pthread_mutex_timedlock)(pthread_mutex_t* mutex,
const struct timespec *abstime);
/* Points at the libc/pthread pthread_mutex_trylock(). We wrap
* pthread_mutex_trylock, so need to retain this pointer to call out to the
* libc version. */
static int (*real_pthread_mutex_trylock)(pthread_mutex_t* mutex);
/**
 * Return a pointer to the buffer header, which happens to occupy the
 * initial bytes in the mapped region.
 */
static struct syscallbuf_hdr* buffer_hdr(void)
{
	struct syscallbuf_hdr* hdr = (struct syscallbuf_hdr*)buffer;
	return hdr;
}
/**
 * Return a pointer to the byte just after the last valid syscall record in
 * the buffer.
 */
static byte* buffer_last(void)
{
	struct syscallbuf_hdr* hdr = buffer_hdr();
	return (byte*)next_record(hdr);
}
/**
 * Return a pointer to the byte just after the very end of the mapped
 * region.
 */
static byte* buffer_end(void)
{
	return &buffer[SYSCALLBUF_BUFFER_SIZE];
}
/**
 * Same as libc memcpy(), but usable within syscallbuf transaction
 * critical sections (the libc symbol is poisoned above so that no
 * call into libc can sneak into the critical path).
 *
 * Regions must not overlap, same as memcpy().  Returns |dest|.
 */
static void* local_memcpy(void* dest, const void* source, size_t n)
{
	unsigned char* to = dest;
	const unsigned char* from = source;
	size_t i;

	for (i = 0; i < n; ++i) {
		to[i] = from[i];
	}
	return dest;
}
/* The following are wrappers for the syscalls invoked by this library
* itself. These syscalls will generate ptrace traps. */
extern long _traced_raw_syscall(int syscallno, long a0, long a1, long a2,
long a3, long a4, long a5);
/**
 * Translate a raw kernel return value into POSIX -1/errno form:
 * values in [-EHWPOISON, -1] are treated as negated errnos; anything
 * else passes through unchanged.
 */
static int update_errno_ret(long ret)
{
	/* EHWPOISON is the last known errno as of linux 3.9.5. */
	if (-EHWPOISON <= ret && ret < 0) {
		errno = -ret;
		return -1;
	}
	return ret;
}
/**
 * Make a traced syscall (one that ptrace-traps to rr) and apply
 * POSIX -1/errno semantics to the raw kernel return value.
 */
static int traced_syscall(int syscallno, long a0, long a1, long a2,
			  long a3, long a4, long a5)
{
	long ret = _traced_raw_syscall(syscallno, a0, a1, a2, a3, a4, a5);
	return update_errno_ret(ret);
}
/* Arity helpers: traced_syscallN() pads the unused trailing args with
 * 0 and launders each argument through uintptr_t so that pointer args
 * are passed through unmangled. */
#define traced_syscall6(no, a0, a1, a2, a3, a4, a5)			\
	traced_syscall(no, (uintptr_t)a0, (uintptr_t)a1, (uintptr_t)a2, (uintptr_t)a3, (uintptr_t)a4, (uintptr_t)a5)
#define traced_syscall5(no, a0, a1, a2, a3, a4)		\
	traced_syscall6(no, a0, a1, a2, a3, a4, 0)
#define traced_syscall4(no, a0, a1, a2, a3)	\
	traced_syscall5(no, a0, a1, a2, a3, 0)
#define traced_syscall3(no, a0, a1, a2)		\
	traced_syscall4(no, a0, a1, a2, 0)
#define traced_syscall2(no, a0, a1)		\
	traced_syscall3(no, a0, a1, 0)
#define traced_syscall1(no, a0)			\
	traced_syscall2(no, a0, 0)
#define traced_syscall0(no)			\
	traced_syscall1(no, 0)
/**
 * Make a raw traced syscall using the params in |call|. "Raw" traced
 * syscalls return the raw kernel return value, and don't transform it
 * to -1/errno per POSIX semantics.
 */
static long traced_raw_syscall(const struct syscall_info* call)
{
	/* FIXME: pass |call| to avoid pushing these on the stack
	 * again. */
	return _traced_raw_syscall(call->no,
				   call->args[0], call->args[1],
				   call->args[2], call->args[3],
				   call->args[4], call->args[5]);
}
/* Implemented in assembly; returns the address of the traced-syscall
 * instruction, which rr uses to recognize syscalls made by this
 * library. */
extern void* get_traced_syscall_entry_point(void);
/**
 * Traced fcntl(2).  NOTE(review): the optional third argument is
 * always read as void*, whatever the command; this assumes every
 * fcntl arg type fits one register slot (true on ia32, where
 * SYS_fcntl64 is used) — confirm if this is ever ported.
 */
static int traced_fcntl(int fd, int cmd, ...)
{
	va_list ap;
	void *arg;

	va_start(ap, cmd);
	arg = va_arg(ap, void*);
	va_end(ap);

	return traced_syscall3(SYS_fcntl64, fd, cmd, arg);
}
/** Traced getpid(2). */
static pid_t traced_getpid(void)
{
	return traced_syscall0(SYS_getpid);
}
/** Traced gettid(2); glibc provides no wrapper for this syscall. */
static pid_t traced_gettid(void)
{
	return traced_syscall0(SYS_gettid);
}
/** Traced perf_event_open(2); used to create the desched counter. */
static int traced_perf_event_open(struct perf_event_attr *attr,
				  pid_t pid, int cpu, int group_fd,
				  unsigned long flags)
{
	return traced_syscall5(SYS_perf_event_open, attr, pid, cpu, group_fd, flags);
}
/** Traced prctl(2); used to install the seccomp-bpf filter. */
static int traced_prctl(int option, unsigned long arg2, unsigned long arg3,
			unsigned long arg4, unsigned long arg5)
{
	return traced_syscall5(SYS_prctl, option, arg2, arg3, arg4, arg5);
}
/** Deliver |sig| to this process, like raise(3), via kill(getpid()). */
static int traced_raise(int sig)
{
	return traced_syscall2(SYS_kill, traced_getpid(), sig);
}
/** Traced rt_sigprocmask(2). */
static int traced_sigprocmask(int how, const sigset_t* set, sigset_t* oldset)
{
	/* Warning: expecting this to only change the mask of the
	 * current task is a linux-ism; POSIX leaves the behavior
	 * undefined. */
	return traced_syscall4(SYS_rt_sigprocmask, how, set, oldset,
			       _NSIG / 8);
}
/** Traced write(2). */
static ssize_t traced_write(int fd, const void* buf, size_t count)
{
	return traced_syscall3(SYS_write, fd, buf, count);
}
/* We can't use the rr logging helpers because they rely on libc
 * syscall-invoking functions, so roll our own here.
 *
 * Formats |msg| into a fixed 1KB stack buffer and writes it to
 * stderr with a traced write(2).  Over-long messages are truncated.
 *
 * XXX just use these for all logging? */
__attribute__((format(printf, 1, 2)))
static void logmsg(const char* msg, ...)
{
	va_list args;
	char buf[1024];
	int len;

	va_start(args, msg);
	len = vsnprintf(buf, sizeof(buf) - 1, msg, args);
	va_end(args);

	if (len < 0) {
		/* Output/encoding error; nothing sensible to write. */
		return;
	}
	if (len > (int)sizeof(buf) - 2) {
		/* vsnprintf() returns the would-be length on
		 * truncation; clamp to the number of bytes actually
		 * stored so we don't write() stale stack bytes. */
		len = sizeof(buf) - 2;
	}
	traced_write(STDERR_FILENO, buf, len);
}
#ifndef NDEBUG
/* Local assert(): logs through our raw-syscall logger and aborts via
 * a traced SIGABRT, avoiding libc's assert machinery entirely. */
# define assert(cond)							\
	do {								\
		if (!(cond)) {						\
			logmsg("%s:%d: Assertion `" #cond "' failed.\n", \
			       __FILE__, __LINE__);			\
			traced_raise(SIGABRT);				\
		}							\
	} while (0)
#else
# define assert(cond) ((void)0)
#endif
/* Log an unrecoverable error (with errno and tid for context) and
 * terminate the whole thread group with EX_OSERR.  Never returns. */
#define fatal(msg, ...)							\
	do {								\
		logmsg("[FATAL] (%s:%d: errno: %s: tid: %d) " msg "\n", \
		       __FILE__, __LINE__, strerror(errno),		\
		       traced_gettid(), ##__VA_ARGS__);			\
		traced_syscall1(SYS_exit_group, EX_OSERR);		\
	} while (0)
#ifdef DEBUGTAG
/* Debug logging; compiled in only when DEBUGTAG is defined at the
 * top of this file. */
# define debug(msg, ...)					\
	logmsg("[" DEBUGTAG "] " msg "\n", ##__VA_ARGS__)
#else
# define debug(msg, ...) ((void)0)
#endif
/**
 * Mask out all signals in preparation for a critical section.
 * Previous mask saved to |saved_mask|, which should be passed to
 * |exit_signal_critical_section()|.
 */
static void enter_signal_critical_section(sigset_t* saved_mask)
{
	sigset_t mask;
	/* sigfillset() is a pure bitset helper; it makes no syscall,
	 * so it's safe to call here. */
	sigfillset(&mask);
	traced_sigprocmask(SIG_BLOCK, &mask, saved_mask);
}
/**
 * Restore |saved_mask|, after exiting critical section.
 */
static void exit_signal_critical_section(const sigset_t* saved_mask)
{
	traced_sigprocmask(SIG_SETMASK, saved_mask, NULL);
}
/* Helpers for invoking untraced syscalls, which do *not* generate
 * ptrace traps.
 *
 * XXX make a nice assembly helper like libc's |syscall()|? */
extern long _untraced_raw_syscall(int syscallno, long a0, long a1, long a2,
				  long a3, long a4, long a5);
/**
 * Unlike |traced_syscall()|, this helper is implicitly "raw" (returns
 * the direct kernel return value), because the vsyscall hooks have to
 * save that raw return value.
 */
static long untraced_syscall(int syscallno, long a0, long a1, long a2,
			     long a3, long a4, long a5)
{
	return _untraced_raw_syscall(syscallno, a0, a1, a2, a3, a4, a5);
}
/* Arity helpers mirroring traced_syscallN() above: pad unused
 * trailing args with 0 and launder each through uintptr_t. */
#define untraced_syscall6(no, a0, a1, a2, a3, a4, a5)			\
	untraced_syscall(no, (uintptr_t)a0, (uintptr_t)a1, (uintptr_t)a2, (uintptr_t)a3, (uintptr_t)a4, (uintptr_t)a5)
#define untraced_syscall5(no, a0, a1, a2, a3, a4)	\
	untraced_syscall6(no, a0, a1, a2, a3, a4, 0)
#define untraced_syscall4(no, a0, a1, a2, a3)		\
	untraced_syscall5(no, a0, a1, a2, a3, 0)
#define untraced_syscall3(no, a0, a1, a2)	\
	untraced_syscall4(no, a0, a1, a2, 0)
#define untraced_syscall2(no, a0, a1)		\
	untraced_syscall3(no, a0, a1, 0)
#define untraced_syscall1(no, a0)		\
	untraced_syscall2(no, a0, 0)
#define untraced_syscall0(no)			\
	untraced_syscall1(no, 0)
/* Implemented in assembly; the address of the untraced syscall
 * instruction, which the seccomp filter whitelists below. */
extern void* get_untraced_syscall_entry_point(void);
/**
 * Make the *un*traced socketcall |call| with the given args.
 *
 * NB: like untraced_syscall(), this helper *DOES NOT* touch the raw
 * return value from the kernel. Callers must update errno
 * themselves if necessary.
 *
 * The arg vector lives on this function's stack; the kernel copies
 * it during the (synchronous) syscall, so that's safe.
 */
static long untraced_socketcall(int call,
				long a0, long a1, long a2, long a3, long a4)
{
	unsigned long args[] = { a0, a1, a2, a3, a4 };
	return untraced_syscall2(SYS_socketcall, call, args);
}
/* Arity helpers for socketcalls, same pattern as the syscall macros
 * above. */
#define untraced_socketcall5(no, a0, a1, a2, a3, a4)			\
	untraced_socketcall(no, (uintptr_t)a0, (uintptr_t)a1, (uintptr_t)a2, (uintptr_t)a3, (uintptr_t)a4)
#define untraced_socketcall4(no, a0, a1, a2, a3)	\
	untraced_socketcall5(no, a0, a1, a2, a3, 0)
#define untraced_socketcall3(no, a0, a1, a2)		\
	untraced_socketcall4(no, a0, a1, a2, 0)
#define untraced_socketcall2(no, a0, a1)	\
	untraced_socketcall3(no, a0, a1, 0)
#define untraced_socketcall1(no, a0)		\
	untraced_socketcall2(no, a0, 0)
#define untraced_socketcall0(no)		\
	untraced_socketcall1(no, 0)
/**
 * Call this hook from |__kernel_vsyscall()|, to buffer syscalls that
 * we otherwise couldn't wrap through LD_PRELOAD helpers. Return the
 * *RAW* kernel return value, not the -1/errno mandated by POSIX.
 *
 * Remember, this function runs *below* the level of libc. libc can't
 * know that its call to |__kernel_vsyscall()| has been re-routed to
 * us.
 */
static long vsyscall_hook(const struct syscall_info* call);
/**
 * |__kernel_vsyscall()| is /actually/ patched to jump here. This
 * trampoline then prepares a "real" call to |vsyscall_hook()|.
 *
 * The pushes below build a |struct syscall_info| in place on the
 * stack (the ia32 syscall args live in ebx/ecx/edx/esi/edi/ebp, the
 * number in eax); the CFI directives keep stack unwinding correct
 * through the trampoline.
 */
__asm__(".text\n\t"
	".globl _vsyscall_hook_trampoline\n\t"
	".type _vsyscall_hook_trampoline, @function\n\t"
	"_vsyscall_hook_trampoline:\n\t"
	".cfi_startproc\n\t"

	/* The monkeypatch pushed $eax on the stack, but there's no
	 * CFI info for it.  Fix up the CFA offset here to account for
	 * the monkeypatch code. */
	".cfi_adjust_cfa_offset 4\n\t"
	".cfi_rel_offset %eax, 0\n\t"

	/* Pull $eax back off the stack.  Now our syscall-arg
	 * registers are restored to their state on entry to
	 * __kernel_vsyscall(). */
	"popl %eax\n\t"
	".cfi_adjust_cfa_offset -4\n\t"
	".cfi_restore %eax\n\t"

	/* Build a |struct syscall_info| by pushing all the syscall
	 * args and the number onto the stack. */
		         /* struct syscall_info info; */
	"pushl %ebp\n\t" /* info.args[5] = $ebp; */
	".cfi_adjust_cfa_offset 4\n\t"
	".cfi_rel_offset %ebp, 0\n\t"
	"pushl %edi\n\t" /* info.args[4] = $edi; */
	".cfi_adjust_cfa_offset 4\n\t"
	".cfi_rel_offset %edi, 0\n\t"
	"pushl %esi\n\t" /* info.args[3] = $esi; */
	".cfi_adjust_cfa_offset 4\n\t"
	".cfi_rel_offset %esi, 0\n\t"
	"pushl %edx\n\t" /* info.args[2] = $edx; */
	".cfi_adjust_cfa_offset 4\n\t"
	".cfi_rel_offset %edx, 0\n\t"
	"pushl %ecx\n\t" /* info.args[1] = $ecx; */
	".cfi_adjust_cfa_offset 4\n\t"
	".cfi_rel_offset %ecx, 0\n\t"
	"pushl %ebx\n\t" /* info.args[0] = $ebx; */
	".cfi_adjust_cfa_offset 4\n\t"
	".cfi_rel_offset %ebx, 0\n\t"
	"pushl %eax\n\t" /* info.no = $eax; */
	".cfi_adjust_cfa_offset 4\n\t"

	/* $esp points at &info.  Push that pointer on the stack as
	 * our arg for syscall_hook(). */
	"movl %esp, %ecx\n\t"
	"pushl %ecx\n\t"
	".cfi_adjust_cfa_offset 4\n\t"

	"movl $vsyscall_hook, %eax\n\t"
	"call *%eax\n\t"	/* $eax = vsyscall_hook(&info); */

	/* $eax is now the syscall return value.  Erase the |&info|
	 * arg and |info.no| from the stack so that we can restore the
	 * other registers we saved. */
	"addl $8, %esp\n\t"
	".cfi_adjust_cfa_offset -8\n\t"

	/* Contract of __kernel_vsyscall() is that even callee-save
	 * registers aren't touched, so we restore everything here. */
	"popl %ebx\n\t"
	".cfi_adjust_cfa_offset -4\n\t"
	".cfi_restore %ebx\n\t"
	"popl %ecx\n\t"
	".cfi_adjust_cfa_offset -4\n\t"
	".cfi_restore %ecx\n\t"
	"popl %edx\n\t"
	".cfi_adjust_cfa_offset -4\n\t"
	".cfi_restore %edx\n\t"
	"popl %esi\n\t"
	".cfi_adjust_cfa_offset -4\n\t"
	".cfi_restore %esi\n\t"
	"popl %edi\n\t"
	".cfi_adjust_cfa_offset -4\n\t"
	".cfi_restore %edi\n\t"
	"popl %ebp\n\t"
	".cfi_adjust_cfa_offset -4\n\t"
	".cfi_restore %ebp\n\t"

	/* Return to the caller of *|__kernel_vsyscall()|*, because
	 * the monkeypatch jumped to us. */
	"ret\n\t"
	".cfi_endproc\n\t"
	".size _vsyscall_hook_trampoline, .-_vsyscall_hook_trampoline\n\t");
/**
 * Return the runtime address of |_vsyscall_hook_trampoline| in a
 * PIC-safe way: the call/pop pair materializes the current $eip, to
 * which the assembler-computed displacement to the trampoline label
 * is then added.
 */
static void* get_vsyscall_hook_trampoline(void)
{
	void *ret;
	__asm__ __volatile__(
		"call .L_get_vsyscall_hook_trampoline__pic_helper\n\t"
		".L_get_vsyscall_hook_trampoline__pic_helper: pop %0\n\t"
		"addl $(_vsyscall_hook_trampoline - .L_get_vsyscall_hook_trampoline__pic_helper),%0"
		: "=a"(ret));
	return ret;
}
/**
 * Ask rr (via a "magic" rr-implemented syscall) to set up the
 * syscallbuf for this thread, with all signals blocked for the
 * duration of the trap.
 *
 * |args| carries everything rr needs: the traced/untraced syscall
 * entry points, the control-socket address the child will connect
 * to, a pre-built sendmsg() header for sharing fds, a pointer to the
 * control-message payload where the shared fd number lives, and
 * preallocated scratch space for the tracer's socketcalls.  On
 * return (when syscallbuf is enabled), |args->syscallbuf_ptr| points
 * at the mapped buffer with an initialized header.
 */
static void rrcall_init_buffers(struct rrcall_init_buffers_params* args)
{
	sigset_t saved_mask;

	enter_signal_critical_section(&saved_mask);
	traced_syscall1(SYS_rrcall_init_buffers, args);
	exit_signal_critical_section(&saved_mask);
}
/**
 * Ask rr (via a "magic" rr-implemented syscall) to monkeypatch
 * |__kernel_vsyscall()| so that it jumps to |vdso_hook_trampoline|,
 * with all signals blocked for the duration of the trap.
 */
static void rrcall_monkeypatch_vdso(void* vdso_hook_trampoline)
{
	sigset_t saved_mask;

	enter_signal_critical_section(&saved_mask);
	traced_syscall1(SYS_rrcall_monkeypatch_vdso, vdso_hook_trampoline);
	exit_signal_critical_section(&saved_mask);
}
/**
 * Install the seccomp-bpf that generates trace traps for all syscalls
 * other than those made through _untraced_syscall_entry_point().
 */
static void install_syscall_filter(void)
{
	void* untraced_syscall_start = get_untraced_syscall_entry_point();
	struct sock_filter filter[] = {
		/* Allow all system calls from our protected_call
		 * callsite */
		ALLOW_SYSCALLS_FROM_CALLSITE((uintptr_t)untraced_syscall_start),
		/* All the rest are handled in rr */
		TRACE_PROCESS,
	};
	struct sock_fprog prog = {
		.len = (unsigned short)(sizeof(filter)/sizeof(filter[0])),
		.filter = filter,
	};

	debug("Initializing syscall buffer: protected_call_start = %p",
	      untraced_syscall_start);

	/* NO_NEW_PRIVS must be set before an unprivileged process may
	 * install a seccomp filter. */
	if (traced_prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
		fatal("prctl(NO_NEW_PRIVS) failed, SECCOMP_FILTER is not available: your kernel is too old. Use `record -n` to disable the filter.");
	}

	/* Note: the filter is installed only for record. This call
	 * will be emulated in the replay */
	if (traced_prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER,
			 (uintptr_t)&prog, 0, 0)) {
		fatal("prctl(SECCOMP) failed, SECCOMP_FILTER is not available: your kernel is too old. Use `record -n` to disable the filter.");
	}

	/* anything that happens from this point on gets filtered! */
}
/**
 * Return a counter that generates a signal targeted at this task
 * every time the task is descheduled |nr_descheds| times.
 *
 * Any failure during setup is fatal (terminates the thread group).
 */
static int open_desched_event_counter(size_t nr_descheds)
{
	struct perf_event_attr attr;
	int fd;
	struct f_owner_ex own;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_CONTEXT_SWITCHES;
	attr.disabled = 1;
	attr.sample_period = nr_descheds;

	fd = traced_perf_event_open(&attr, 0/*self*/, -1/*any cpu*/, -1, 0);
	if (0 > fd) {
		/* |nr_descheds| is a size_t, so the correct specifier
		 * is %zu (was %u, a format/argument mismatch). */
		fatal("Failed to perf_event_open(cs, period=%zu)", nr_descheds);
	}
	/* Request async notification (SIGIO-style) when counter data
	 * becomes available... */
	if (traced_fcntl(fd, F_SETFL, O_ASYNC)) {
		fatal("Failed to fcntl(O_ASYNC) the desched counter");
	}
	/* ...targeted specifically at this task... */
	own.type = F_OWNER_TID;
	own.pid = traced_gettid();
	if (traced_fcntl(fd, F_SETOWN_EX, &own)) {
		fatal("Failed to fcntl(SETOWN_EX) the desched counter to this");
	}
	/* ...delivered as our dedicated desched signal. */
	if (traced_fcntl(fd, F_SETSIG, SYSCALLBUF_DESCHED_SIGNAL)) {
		fatal("Failed to fcntl(SETSIG, %d) the desched counter",
		      SYSCALLBUF_DESCHED_SIGNAL);
	}
	return fd;
}
/**
 * Set up the syscallbuf for the calling thread: create the desched
 * counter (if buffering is enabled), build the sendmsg() arguments
 * used to share that fd with rr, and trap to rr via
 * rrcall_init_buffers() to have the buffer mapped.
 *
 * NOTE(review): |addr|, |msg|, |cmsgbuf|, |args_vec| and |args| are
 * all stack-allocated; rr must consume them during the
 * rrcall_init_buffers() trap, before this frame returns — confirm.
 */
static void set_up_buffer(void)
{
	struct sockaddr_un addr;
	struct msghdr msg;
	struct iovec data;
	int msgbuf;
	struct cmsghdr* cmsg;
	int* msg_fdptr;
	int* cmsg_fdptr;
	char cmsgbuf[CMSG_SPACE(sizeof(*cmsg_fdptr))];
	struct socketcall_args args_vec;
	struct rrcall_init_buffers_params args;

	assert(!buffer);

	/* NB: we want this setup emulated during replay. */
	if (buffer_enabled) {
		desched_counter_fd = open_desched_event_counter(1);
	}

	/* Prepare arguments for rrcall. We do this in the tracee
	 * just to avoid some hairy IPC to set up the arguments
	 * remotely from the tracer; this isn't strictly
	 * necessary. */
	prepare_syscallbuf_socket_addr(&addr, traced_gettid());

	/* Build a one-iovec message whose SCM_RIGHTS control block
	 * carries the desched counter fd. */
	memset(&msg, 0, sizeof(msg));
	msg_fdptr = &msgbuf;
	data.iov_base = msg_fdptr;
	data.iov_len = sizeof(msgbuf);
	msg.msg_iov = &data;
	msg.msg_iovlen = 1;
	msg.msg_control = cmsgbuf;
	msg.msg_controllen = sizeof(cmsgbuf);
	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_len = CMSG_LEN(sizeof(*cmsg_fdptr));
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;
	cmsg_fdptr = (int*)CMSG_DATA(cmsg);

	/* Set the "fd parameter" in the message buffer, which we send
	 * to let the other side know the local fd number we shared to
	 * it. */
	*msg_fdptr = desched_counter_fd;
	/* Set the "fd parameter" in the cmsg buffer, which is the one
	 * the kernel parses, dups, then sets to the fd number
	 * allocated in the other process. */
	*cmsg_fdptr = desched_counter_fd;

	args.syscallbuf_enabled = buffer_enabled;
	args.traced_syscall_ip = get_traced_syscall_entry_point();
	args.untraced_syscall_ip = get_untraced_syscall_entry_point();
	args.sockaddr = &addr;
	args.msg = &msg;
	args.fdptr = cmsg_fdptr;
	args.args_vec = &args_vec;

	/* Trap to rr: let the magic begin! We've prepared the buffer
	 * so that it's immediately ready to be sendmsg()'d to rr to
	 * share the desched counter to it (under rr's control). rr
	 * can further use the buffer to share more fd's to us.
	 *
	 * If the desched signal is currently blocked, then the tracer
	 * will clear our TCB guard and we won't be able to buffer
	 * syscalls. But the tracee will set the guard when (or if)
	 * the signal is unblocked. */
	rrcall_init_buffers(&args);

	/* rr initializes the buffer header. */
	buffer = args.syscallbuf_ptr;
}
/**
 * Initialize thread-local buffering state, if enabled.  Must run
 * after init_process(), exactly once per thread.
 */
static void init_thread(void)
{
	assert(process_inited);
	assert(!thread_inited);

	if (buffer_enabled) {
		set_up_buffer();
	}
	thread_inited = 1;
}
/**
 * After a fork(), we retain a CoW mapping of our parent's syscallbuf.
 * That's bad, because we don't want to use that buffer.  So forget
 * the parent's copy and set up a fresh one for this process.
 *
 * FIXME: this "leaks" the parent's old copy in our address space.
 */
static void post_fork_child(void)
{
	thread_inited = 0;
	buffer = NULL;
	init_thread();
}
/**
 * Initialize process-global buffering state, if enabled.  Runs as an
 * ELF constructor when the dso is loaded.
 */
static void __attribute__((constructor))
init_process(void)
{
	assert(!process_inited);

	if (getenv("_RR_CHECK_PRELOAD")) {
		/* The tracer parent is just checking that we loaded.
		 * We did, so return a success code. */
		exit(0);
	}

	/* Resolve the real pthread entry points we interpose below.
	 * NOTE(review): dlsym()/RTLD_NEXT are used without a visible
	 * #include <dlfcn.h> in this file; presumably it comes in via
	 * syscall_buffer.h or build flags — confirm. */
	real_pthread_create = dlsym(RTLD_NEXT, "pthread_create");
	real_pthread_mutex_lock = dlsym(RTLD_NEXT, "pthread_mutex_lock");
	real_pthread_mutex_timedlock = dlsym(RTLD_NEXT, "pthread_mutex_timedlock");
	real_pthread_mutex_trylock = dlsym(RTLD_NEXT, "pthread_mutex_trylock");

	buffer_enabled = !!getenv(SYSCALLBUF_ENABLED_ENV_VAR);
	if (!buffer_enabled) {
		debug("Syscall buffering is disabled");
		process_inited = 1;
		return;
	}

	/* Re-initialize per-thread state in fork children; see
	 * post_fork_child(). */
	pthread_atfork(NULL, NULL, post_fork_child);

	install_syscall_filter();
	rrcall_monkeypatch_vdso(get_vsyscall_hook_trampoline());
	process_inited = 1;

	init_thread();
}
/**
 * In a thread newly created by |pthread_create()|, first initialize
 * thread-local internal rr data, then trampoline into the user's
 * thread function.
 */
struct thread_func_data {
	void* (*start_routine) (void*);	/* user's thread entry point */
	void* arg;			/* user's argument to it */
};
/**
 * Thread entry point installed by our pthread_create() wrapper:
 * initialize rr's per-thread state, run the user's routine, then
 * detach from the syscallbuf and free the trampoline payload.
 */
static void* thread_trampoline(void* arg)
{
	struct thread_func_data* fndata = arg;
	void* result;

	init_thread();

	result = fndata->start_routine(fndata->arg);

	/* We don't want glibc re-entering us during thread cleanup. */
	buffer = NULL;
	free(fndata);
	return result;
}
/**
* Interpose |pthread_create()| so that we can use a custom trampoline
* function (see above) that initializes rr thread-local data for new
* threads.
*
* This is a wrapper of |pthread_create()|, but not like the ones
* below: we don't wrap |pthread_create()| in order to buffer its
* syscalls, rather in order to initialize rr thread data.
*/
int pthread_create(pthread_t* thread, const pthread_attr_t* attr,
		   void* (*start_routine) (void*), void* arg)
{
	struct thread_func_data* data;
	void* saved_buffer = buffer;
	int ret;

	data = malloc(sizeof(*data));
	if (!data) {
		/* Previously unchecked: a failed malloc() would have
		 * been dereferenced below.  Match pthread_create()'s
		 * documented resource-exhaustion error instead. */
		return EAGAIN;
	}
	data->start_routine = start_routine;
	data->arg = arg;
	/* Don't let the new thread use our TLS pointer. */
	buffer = NULL;
	ret = real_pthread_create(thread, attr, thread_trampoline, data);
	buffer = saved_buffer;
	return ret;
}
/* glibc-internal mutex-kind flag bits controlling lock elision.
 * NOTE(review): these mirror glibc's private pthread_mutex layout
 * and bit assignments; they need revisiting if glibc changes. */
#define PTHREAD_MUTEX_ELISION_NP 256
#define PTHREAD_MUTEX_NO_ELISION_NP 512
/**
 * Force-disable lock elision on |mutex| by rewriting the
 * glibc-internal |__kind| flag bits.
 */
static void disable_elision_for_mutex(pthread_mutex_t* mutex)
{
	/* Cancel explicitly-set elision requests */
	mutex->__data.__kind &= ~PTHREAD_MUTEX_ELISION_NP;
	/* Prevent auto-enabling of elision */
	mutex->__data.__kind |= PTHREAD_MUTEX_NO_ELISION_NP;
}
int pthread_mutex_lock(pthread_mutex_t* mutex)
{
	/* Prevent use of lock elision; Haswell's TSX/RTM features used by
	   lock elision increment the rbc perf counter for instructions which
	   are later rolled back if the transaction fails. */
	disable_elision_for_mutex(mutex);
	return real_pthread_mutex_lock(mutex);
}
int pthread_mutex_timedlock(pthread_mutex_t* mutex,
			    const struct timespec *abstime)
{
	/* See pthread_mutex_lock() for why elision must be disabled. */
	disable_elision_for_mutex(mutex);
	return real_pthread_mutex_timedlock(mutex, abstime);
}
int pthread_mutex_trylock(pthread_mutex_t* mutex)
{
	/* See pthread_mutex_lock() for why elision must be disabled. */
	disable_elision_for_mutex(mutex);
	return real_pthread_mutex_trylock(mutex);
}
/**
* vsyscall hooks start here.
*
* !!! NBB !!!: from here on, all code that executes within the
* critical sections of transactions *MUST KEEP $ip IN THE SYSCALLBUF
* CODE*. That means no calls into libc, even for innocent-looking
* functions like |memcpy()|.
*
* How syscall hooks operate:
*
* 1. The rr tracer monkey-patches __kernel_vsyscall() to jump to
* _vsyscall_hook_trampoline() above.
* 2. When a call is made to __kernel_vsyscall(), it jumps to
* _vsyscall_hook_trampoline(), where the syscall params are
* packaged up into a call to vsyscall_hook() below.
* 3. vsyscall_hook() dispatches to a syscall processor function.
* 4. The syscall processor prepares a new record in the buffer. See
* struct syscallbuf_record for record fields. If the buffer runs
* out of space, the processor function aborts and makes a traced
* syscall, trapping to rr. rr then flushes the buffer. Records
* are directly saved to trace, and a buffer-flush event is
* recorded without execution info because it's a synthetic event.
* 5. Then, the syscall processor redirects all potential output
* for the syscall to the record (and corrects the overall size of
* the record while it does so).
* 6. The syscall is invoked through a asm helper that does *not*
* ptrace-trap to rr.
* 7. The syscall output, written on the buffer, is copied to the
* original pointers provided by the user. Take notice that this
* part saves us the injection of the data on replay, as we only
* need to push the data to the buffer and the wrapper code will
* copy it to the user address for us.
* 8. The return value and overall size are saved to the record.
*/
/**
 * Call this and save the result at the start of every system call we
 * want to buffer. The result is a pointer into the record space. You
 * can add to this pointer to allocate space in the trace record.
 * However, do not read or write through this pointer until
 * start_commit_syscall() has been called. And you *must* call
 * start_commit_syscall() after this is called, otherwise buffering
 * state will be inconsistent between syscalls.
 *
 * Returns NULL when buffering isn't possible right now (no buffer
 * for this thread, or the buffer is locked by a transaction we may
 * have interrupted); callers must then fall back on a traced
 * syscall.
 *
 * See |sys_clock_gettime()| for a simple example of how this helper
 * should be used to buffer outparam data.
 */
static void* prep_syscall(void)
{
	if (!buffer) {
		return NULL;
	}
	if (buffer_hdr()->locked) {
		/* We may be reentering via a signal handler. Return
		 * an invalid pointer. */
		return NULL;
	}
	/* We don't need to worry about a race between testing
	 * |locked| and setting it here. rr recording is responsible
	 * for ensuring signals are not delivered during
	 * syscall_buffer prologue and epilogue code.
	 *
	 * XXX except for synchronous signals generated in the syscall
	 * buffer code, while reading/writing user pointers */
	buffer_hdr()->locked = 1;
	/* "Allocate" space for a new syscall record, not including
	 * syscall outparam data. */
	return buffer_last() + sizeof(struct syscallbuf_record);
}
/**
 * Arm the desched counter: the next deschedule of this task will
 * deliver its signal (see the |desched_counter_fd| setup above).
 */
static void arm_desched_event(void)
{
	/* Don't trace the ioctl; doing so would trigger a flushing
	 * ptrace trap, which is exactly what this code is trying to
	 * avoid! :) Although we don't allocate extra space for these
	 * ioctl's, we do record that we called them; the replayer
	 * knows how to skip over them.
	 *
	 * The untraced syscall returns the raw kernel value, so any
	 * nonzero result is a -errno. */
	if (untraced_syscall3(SYS_ioctl, desched_counter_fd,
			      PERF_EVENT_IOC_ENABLE, 0)) {
		fatal("Failed to ENABLE counter %d", desched_counter_fd);
	}
}
/**
 * Disarm the desched counter; see arm_desched_event().
 */
static void disarm_desched_event(void)
{
	/* See above. */
	if (untraced_syscall3(SYS_ioctl, desched_counter_fd,
			      PERF_EVENT_IOC_DISABLE, 0)) {
		fatal("Failed to DISABLE counter %d", desched_counter_fd);
	}
}
/**
 * Return 1 if it's ok to proceed with buffering this system call.
 * Return 0 if we should trace the system call.
 * This must be checked before proceeding with the buffered system call.
 */
/* Classifies whether a buffered syscall may block (and hence needs
 * the desched counter armed around it). */
/* (Negative numbers so as to not be valid syscall numbers, in case
 * the |int| arguments below are passed in the wrong order.) */
enum { MAY_BLOCK = -1, WONT_BLOCK = -2 };