/
scheduler.cpp
411 lines (351 loc) · 16.7 KB
/
scheduler.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
/** $glic$
* Copyright (C) 2012-2015 by Massachusetts Institute of Technology
* Copyright (C) 2010-2013 by The Board of Trustees of Stanford University
* Copyright (C) 2011 Google Inc.
*
* This file is part of zsim.
*
* zsim is free software; you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, version 2.
*
* If you use this software in your research, we request that you reference
* the zsim paper ("ZSim: Fast and Accurate Microarchitectural Simulation of
* Thousand-Core Systems", Sanchez and Kozyrakis, ISCA-40, June 2013) as the
* source of the simulator in any publications that use this software, and that
* you send us a citation of your work.
*
* zsim is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along with
* this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include "scheduler.h"
#include <fstream>
#include <regex>
#include <sys/stat.h>
#include "config.h" // for ParseList
#include "pin.H"
#include "process_tree.h"
#include "profile_stats.h"
#include "str.h"
#include "virt/syscall_name.h"
//The scheduler class started simple, but at some point having it all in the header is too ridiculous. Migrate non perf-intensive calls here! (all but sync, really)
#define WATCHDOG_INTERVAL_USEC (50)
#define WATCHDOG_MAX_MULTIPLER (40) //50us-2ms waits
#define WATCHDOG_STALL_THRESHOLD (100)
//#define DEBUG_FL(args...) info(args)
#define DEBUG_FL(args...)
//#define DEBUG_FUTEX(args...) info(args)
#define DEBUG_FUTEX(args...)
// Unlike glibc's sleep functions suck, this ensures guaranteed minimum sleep time
static void TrueSleep(uint32_t usecs) {
struct timespec req;
struct timespec rem;
req.tv_sec = usecs/1000000;
req.tv_nsec = (usecs*1000) % 1000000000;
while (req.tv_sec != 0 || req.tv_nsec != 0) {
int res = syscall(SYS_nanosleep, &req, &rem); //we don't call glibc's nanosleep because errno is not thread-safe in pintools.
if (res == 0) break;
req = rem;
if (res != -EINTR && res != 0) panic("nanosleep() returned an unexpected error code %d", res);
//info("nanosleep() interrupted!");
}
}
/* Hacky way to figure out if a thread is sleeping on a certain futex.
*
* Uses /proc/<pid>/task/<tid>/syscall, which is only set when the process is
* actually sleeping on the syscall, not just in the kernel (see Linux kernel
* docs). This interface has been available since ~2008.
*/
bool IsSleepingInFutex(uint32_t linuxPid, uint32_t linuxTid, uintptr_t futexAddr) {
std::string fname = "/proc/" + Str(linuxPid) + "/task/" + Str(linuxTid) + "/syscall";
std::ifstream fs(fname);
if (!fs.is_open()) {
warn("Could not open %s", fname.c_str());
return false;
}
std::stringstream ss;
ss << fs.rdbuf();
fs.close();
std::vector<std::string> argList = ParseList<std::string>(ss.str());
bool match = argList.size() >= 2 &&
strtoul(argList[0].c_str(), nullptr, 0) == SYS_futex &&
(uintptr_t)strtoul(argList[1].c_str(), nullptr, 0) == futexAddr;
//info("%s | %s | SYS_futex = %d futexAddr = 0x%lx | match = %d ", ss.str().c_str(), Str(argList).c_str(), SYS_futex, futexAddr, match);
return match;
}
void Scheduler::watchdogThreadFunc() {
info("Started scheduler watchdog thread");
uint64_t lastPhase = 0;
int multiplier = 1;
uint64_t lastMs = 0;
uint64_t fakeLeaveStalls = 0;
while (true) {
TrueSleep(multiplier*WATCHDOG_INTERVAL_USEC);
if (zinfo->terminationConditionMet) {
// Synchronize to avoid racing with EndOfPhaseActions code
// (zinfo->terminationConditionMet is set on EndOfPhaseActions,
// which has schedLock held, we must let it finish)
futex_lock(&schedLock);
info("Terminating scheduler watchdog thread");
futex_unlock(&schedLock);
SimEnd();
}
//Fastpath (unlocked, benign read races, only modifies local state)
if (lastPhase != curPhase && pendingPidCleanups.size() == 0) {
lastPhase = curPhase;
fakeLeaveStalls = 0;
if (multiplier < WATCHDOG_MAX_MULTIPLER) multiplier++;
continue;
}
//if (lastPhase == curPhase && scheduledThreads == outQueue.size() && !sleepQueue.empty()) info("Mult %d curPhase %ld", multiplier, curPhase);
futex_lock(&schedLock);
if (lastPhase == curPhase && !fakeLeaves.empty() && (fakeLeaves.front()->th->futexJoin.action != FJA_WAKE)) {
if (++fakeLeaveStalls >= WATCHDOG_STALL_THRESHOLD) {
info("Detected possible stall due to fake leaves (%ld current)", fakeLeaves.size());
// Uncomment to print all leaves
FakeLeaveInfo* pfl = fakeLeaves.front();
while (pfl) {
info(" [%d/%d] %s (%d) @ 0x%lx", getPid(pfl->th->gid), getTid(pfl->th->gid), GetSyscallName(pfl->syscallNumber), pfl->syscallNumber, pfl->pc);
pfl = pfl->next;
}
// Trigger a leave() on the first process, if the process's blacklist regex allows it
FakeLeaveInfo* fl = fakeLeaves.front();
ThreadInfo* th = fl->th;
uint32_t pid = getPid(th->gid);
uint32_t tid = getTid(th->gid);
uint32_t cid = th->cid;
const g_string& sbRegexStr = zinfo->procArray[pid]->getSyscallBlacklistRegex();
std::regex sbRegex(sbRegexStr.c_str());
if (std::regex_match(GetSyscallName(fl->syscallNumber), sbRegex)) {
// If this is the last leave we catch, it is the culprit for sure -> blacklist it
// Over time, this will blacklist every blocking syscall
// The root reason for being conservative though is that we don't have a sure-fire
// way to distinguish IO waits from truly blocking syscalls (TODO)
if (fakeLeaves.size() == 1) {
info("Blacklisting from future fake leaves: [%d] %s @ 0x%lx | arg0 0x%lx arg1 0x%lx", pid, GetSyscallName(fl->syscallNumber), fl->pc, fl->arg0, fl->arg1);
blockingSyscalls[pid].insert(fl->pc);
}
uint64_t pc = fl->pc;
do {
finishFakeLeave(th);
futex_unlock(&schedLock);
leave(pid, tid, cid);
futex_lock(&schedLock);
// also do real leave for other threads blocked at the same pc ...
fl = fakeLeaves.front();
if (fl == nullptr || getPid(th->gid) != pid || fl->pc != pc)
break;
th = fl->th;
tid = getTid(th->gid);
cid = th->cid;
// ... until a lower bound on queue size, in order to make blacklist work
} while (fakeLeaves.size() > 8);
} else {
info("Skipping, [%d] %s @ 0x%lx | arg0 0x%lx arg1 0x%lx does not match blacklist regex (%s)",
pid, GetSyscallName(fl->syscallNumber), fl->pc, fl->arg0, fl->arg1, sbRegexStr.c_str());
}
fakeLeaveStalls = 0;
}
} else {
fakeLeaveStalls = 0;
}
if (lastPhase == curPhase && scheduledThreads == outQueue.size() && !sleepQueue.empty()) {
//info("Watchdog Thread: Sleep dep detected...")
int64_t wakeupPhase = sleepQueue.front()->wakeupPhase;
int64_t wakeupCycles = (wakeupPhase - curPhase)*zinfo->phaseLength;
int64_t wakeupUsec = (wakeupCycles > 0)? wakeupCycles/zinfo->freqMHz : 0;
//info("Additional usecs of sleep %ld", wakeupUsec);
if (wakeupUsec > 10*1000*1000) warn("Watchdog sleeping for a long time due to long sleep, %ld secs", wakeupUsec/1000/1000);
futex_unlock(&schedLock);
TrueSleep(WATCHDOG_INTERVAL_USEC + wakeupUsec);
futex_lock(&schedLock);
if (lastPhase == curPhase && scheduledThreads == outQueue.size() && !sleepQueue.empty()) {
ThreadInfo* sth = sleepQueue.front();
uint64_t curMs = curPhase*zinfo->phaseLength/zinfo->freqMHz/1000;
uint64_t endMs = sth->wakeupPhase*zinfo->phaseLength/zinfo->freqMHz/1000;
(void)curMs; (void)endMs; //make gcc happy
if (curMs > lastMs + 1000) {
info("Watchdog Thread: Driving time forward to avoid deadlock on sleep (%ld -> %ld ms)", curMs, endMs);
lastMs += 1000;
}
while (sth->state == SLEEPING) {
idlePhases.inc();
callback(); //sth will eventually get woken up
if (futex_haswaiters(&schedLock)) {
//happens commonly with multiple sleepers and very contended I/O...
//info("Sched: Threads waiting on advance, startPhase %ld curPhase %ld", lastPhase, curPhase);
break;
}
if (zinfo->terminationConditionMet) {
info("Termination condition met inside watchdog thread loop, exiting");
break;
}
}
idlePeriods.inc();
multiplier = 0;
}
}
if (multiplier < WATCHDOG_MAX_MULTIPLER) {
multiplier++;
}
lastPhase = curPhase;
//Lazily clean state of processes that terminated abruptly
//NOTE: For now, we rely on the process explicitly telling us that it's going to terminate.
//We could make this self-checking by periodically checking for liveness of the processes we're supposedly running.
//The bigger problem is that if we get SIGKILL'd, we may not even leave a consistent zsim state behind.
while (pendingPidCleanups.size()) {
std::pair<uint32_t, uint32_t> p = pendingPidCleanups.back();
uint32_t pid = p.first; //the procIdx pid
uint32_t osPid = p.second;
std::stringstream ss;
ss << "/proc/" << osPid;
struct stat dummy;
if (stat(ss.str().c_str(), &dummy) == 0) {
info("[watchdog] Deferring cleanup of pid %d (%d), not finished yet", pid, osPid);
break;
}
pendingPidCleanups.pop_back(); //must happen while we have the lock
futex_unlock(&schedLock);
processCleanup(pid);
futex_lock(&schedLock);
}
if (terminateWatchdogThread) {
futex_unlock(&schedLock);
break;
} else {
futex_unlock(&schedLock);
}
}
info("Finished scheduler watchdog thread");
}
void Scheduler::threadTrampoline(void* arg) {
Scheduler* sched = static_cast<Scheduler*>(arg);
sched->watchdogThreadFunc();
}
void Scheduler::startWatchdogThread() {
PIN_SpawnInternalThread(threadTrampoline, this, 64*1024, nullptr);
}
// Accurate join-leave implementation
void Scheduler::syscallLeave(uint32_t pid, uint32_t tid, uint32_t cid, uint64_t pc, int syscallNumber, uint64_t arg0, uint64_t arg1) {
futex_lock(&schedLock);
uint32_t gid = getGid(pid, tid);
ThreadInfo* th = contexts[cid].curThread;
assert(th->gid == gid);
assert_msg(th->cid == cid, "%d != %d", th->cid, cid);
assert(th->state == RUNNING);
assert_msg(pid < blockingSyscalls.size(), "%d >= %ld?", pid, blockingSyscalls.size());
bool blacklisted = blockingSyscalls[pid].find(pc) != blockingSyscalls[pid].end();
if (blacklisted || th->markedForSleep) {
DEBUG_FL("%s @ 0x%lx calling leave(), reason: %s", GetSyscallName(syscallNumber), pc, blacklisted? "blacklist" : "sleep");
futex_unlock(&schedLock);
leave(pid, tid, cid);
} else {
DEBUG_FL("%s @ 0x%lx skipping leave()", GetSyscallName(syscallNumber), pc);
FakeLeaveInfo* si = new FakeLeaveInfo(pc, th, syscallNumber, arg0, arg1);
fakeLeaves.push_back(si);
// FIXME(dsm): zsim.cpp's SyscallEnter may be checking whether we are in a syscall and not calling us.
// If that's the case, this would be stale, which may lead to some false positives/negatives
futex_unlock(&schedLock);
}
}
/* Wake/wait matching code */
// External interface, must be non-blocking
void Scheduler::notifyFutexWakeStart(uint32_t pid, uint32_t tid, uint32_t maxWakes) {
futex_lock(&schedLock);
ThreadInfo* th = gidMap[getGid(pid, tid)];
DEBUG_FUTEX("[%d/%d] wakeStart max %d", pid, tid, maxWakes);
assert(th->futexJoin.action == FJA_NONE);
// Programs sometimes call FUTEX_WAIT with maxWakes = UINT_MAX to wake
// everyone waiting on it; we cap to a reasonably high number to avoid
// overflows on maxAllowedFutexWakeups
maxWakes = MIN(maxWakes, 1<<24 /*16M wakes*/);
maxAllowedFutexWakeups += maxWakes;
th->futexJoin.maxWakes = maxWakes;
futex_unlock(&schedLock);
}
void Scheduler::notifyFutexWakeEnd(uint32_t pid, uint32_t tid, uint32_t wokenUp) {
futex_lock(&schedLock);
ThreadInfo* th = gidMap[getGid(pid, tid)];
DEBUG_FUTEX("[%d/%d] wakeEnd woken %d", pid, tid, wokenUp);
th->futexJoin.action = FJA_WAKE;
th->futexJoin.wokenUp = wokenUp;
futex_unlock(&schedLock);
}
void Scheduler::notifyFutexWaitWoken(uint32_t pid, uint32_t tid) {
futex_lock(&schedLock);
ThreadInfo* th = gidMap[getGid(pid, tid)];
DEBUG_FUTEX("[%d/%d] waitWoken", pid, tid);
th->futexJoin = {FJA_WAIT, 0, 0};
futex_unlock(&schedLock);
}
// Internal, called with schedLock held
void Scheduler::futexWakeJoin(ThreadInfo* th) { // may release schedLock
assert(th->futexJoin.action == FJA_WAKE);
uint32_t maxWakes = th->futexJoin.maxWakes;
uint32_t wokenUp = th->futexJoin.wokenUp;
// Adjust allowance
assert(maxWakes <= maxAllowedFutexWakeups);
assert(wokenUp <= maxWakes);
maxAllowedFutexWakeups -= (maxWakes - wokenUp);
assert(unmatchedFutexWakeups <= maxAllowedFutexWakeups); // should panic...
DEBUG_FUTEX("Futex wake matching %d %d", unmatchedFutexWakeups, maxAllowedFutexWakeups);
while (true) {
futex_unlock(&schedLock);
uint64_t startNs = getNs();
uint32_t iters = 0;
while (wokenUp > unmatchedFutexWakeups) {
TrueSleep(10*(1 + iters)); // linear backoff, start small but avoid overwhelming the OS with short sleeps
iters++;
uint64_t curNs = getNs();
if (curNs - startNs > (2L<<31L) /* ~2s */) {
futex_lock(&schedLock);
warn("Futex wake matching failed (%d/%d) (external/ff waiters?)", unmatchedFutexWakeups, wokenUp);
unmatchedFutexWakeups = 0;
maxAllowedFutexWakeups -= wokenUp;
return;
}
}
futex_lock(&schedLock);
// Recheck after acquire, may have concurrent wakes here
if (wokenUp <= unmatchedFutexWakeups) {
unmatchedFutexWakeups -= wokenUp;
maxAllowedFutexWakeups -= wokenUp;
break;
}
}
DEBUG_FUTEX("Finished futex wake matching");
}
void Scheduler::futexWaitJoin(ThreadInfo* th) {
assert(th->futexJoin.action == FJA_WAIT);
if (unmatchedFutexWakeups >= maxAllowedFutexWakeups) {
warn("External futex wakes? (%d/%d)", unmatchedFutexWakeups, maxAllowedFutexWakeups);
} else {
unmatchedFutexWakeups++;
}
}
void Scheduler::finishFakeLeave(ThreadInfo* th) {
assert(th->fakeLeave);
DEBUG_FL("%s (%d) @ 0x%lx finishFakeLeave()", GetSyscallName(th->fakeLeave->syscallNumber), th->fakeLeave->syscallNumber, th->fakeLeave->pc);
assert_msg(th->state == RUNNING, "gid 0x%x invalid state %d", th->gid, th->state);
FakeLeaveInfo* si = th->fakeLeave;
fakeLeaves.remove(si);
delete si;
assert(th->fakeLeave == nullptr);
}
void Scheduler::waitUntilQueued(ThreadInfo* th) {
uint64_t startNs = getNs();
uint32_t sleepUs = 1;
while(!IsSleepingInFutex(th->linuxPid, th->linuxTid, (uintptr_t)&schedLock)) {
TrueSleep(sleepUs++); // linear backoff, start small but avoid overwhelming the OS with short sleeps
uint64_t curNs = getNs();
if (curNs - startNs > (2L<<31L) /* ~2s */) {
warn("waitUntilQueued for pid %d tid %d timed out", getPid(th->gid), getTid(th->gid));
return;
}
}
}