-
Notifications
You must be signed in to change notification settings - Fork 0
/
crash_reporting.go
370 lines (330 loc) · 11.3 KB
/
crash_reporting.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
// Copyright 2017 The Cockroach Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package log
import (
"fmt"
"os"
"path/filepath"
"runtime"
"runtime/debug"
"strings"
"syscall"
"time"
raven "github.com/getsentry/raven-go"
"github.com/pkg/errors"
"golang.org/x/net/context"
"github.com/cockroachdb/cockroach/pkg/build"
"github.com/cockroachdb/cockroach/pkg/settings"
"github.com/cockroachdb/cockroach/pkg/util/caller"
"github.com/cockroachdb/cockroach/pkg/util/envutil"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
)
var (
// DiagnosticsReportingEnabled wraps "diagnostics.reporting.enabled".
//
// "diagnostics.reporting.enabled" enables reporting of metrics related to a
// node's storage (number, size and health of ranges) back to CockroachDB.
// Collecting this data from production clusters helps us understand and improve
// how our storage systems behave in real-world use cases.
//
// Note: while the setting itself is actually defined with a default value of
// `false`, it is usually automatically set to `true` when a cluster is created
// (or is migrated from an earlier beta version). This can be prevented with the
// env var COCKROACH_SKIP_ENABLING_DIAGNOSTIC_REPORTING.
//
// Doing this, rather than just using a default of `true`, means that a node
// will not errantly send a report using a default before loading settings.
DiagnosticsReportingEnabled = settings.RegisterBoolSetting(
"diagnostics.reporting.enabled",
"enable reporting diagnostic metrics to cockroach labs",
false,
)
// CrashReports wraps "diagnostics.reporting.send_crash_reports".
//
// It is registered with a default of `true`, but SendCrashReport only
// submits a report when both this setting and DiagnosticsReportingEnabled
// are true.
CrashReports = settings.RegisterBoolSetting(
"diagnostics.reporting.send_crash_reports",
"send crash and panic reports",
true,
)
// startTime records when the process started so that crash reports can
// include the server's uptime as an extra tag. It is captured when this
// package's variables are initialized; uptimeTag measures uptime relative
// to it.
startTime = timeutil.Now()
)
// RecoverAndReportPanic is intended for goroutines that run with stderr
// redirected to the logs, so that the user still learns on the real stderr
// that a panic has occurred.
//
// It calls recover() itself. When a panic value is recovered, the value is
// passed to ReportPanic and the panic is then re-raised with the same value;
// when nothing is recovered the function returns without doing anything.
func RecoverAndReportPanic(ctx context.Context, sv *settings.Values) {
	r := recover()
	if r == nil {
		// Nothing was recovered: there is nothing to report.
		return
	}

	// The call stack at this point is usually:
	// - ReportPanic
	// - RecoverAndReportPanic
	// - panic.go
	// - panic()
	// so ReportPanic is asked to pop four frames.
	const framesToPop = 4
	ReportPanic(ctx, sv, r, framesToPop)

	// Re-raise so that the panic keeps propagating after it was reported.
	panic(r)
}
// SafeMessager is implemented by objects that know how to render themselves
// in a form that is already redacted, i.e. suitable for anonymized reporting.
type SafeMessager interface {
	// SafeMessage returns the redacted representation of the receiver.
	SafeMessage() string
}

// SafeType wraps a value whose formatted representation can be reported
// verbatim, i.e. does not leak information.
// A nil `*SafeType` is not valid for use and may cause panics.
type SafeType struct {
	// V is the wrapped value; SafeMessage formats it with fmt's default
	// formatting.
	V interface{}
}

// Compile-time check that SafeType satisfies SafeMessager.
var _ SafeMessager = SafeType{}

// SafeMessage implements SafeMessager. It returns the default fmt
// representation (the same output as the %v verb) of the wrapped value.
func (s SafeType) SafeMessage() string {
	return fmt.Sprint(s.V)
}

// Safe constructs a SafeType wrapping v.
func Safe(v interface{}) SafeType {
	var wrapped SafeType
	wrapped.V = v
	return wrapped
}
// ReportPanic reports that a panic has occurred on the real stderr.
//
// It shouts a generic notice, makes sure the panic details end up both on
// the original stderr and in the log file, submits a crash report for the
// panic value r via SendCrashReport (passing depth+1 as the stack depth), and
// finally flushes the logs.
func ReportPanic(ctx context.Context, sv *settings.Values, r interface{}, depth int) {
	Shout(ctx, Severity_ERROR, "a panic has occurred!")

	if !stderrRedirected {
		// With stderr not redirected, Go's runtime only prints the panic
		// details to the original stderr, so the log file would miss a copy.
		// Produce that copy here.
		logging.printPanicToFile(r)
	} else {
		// Shout() is deliberately not used for the panic details: when stderr
		// is not redirected (e.g. when logging to file is disabled) Shout()
		// would copy its argument to stderr unconditionally, duplicating what
		// Go's runtime already prints there. Instead the details are written
		// to the original stderr by hand, and only in the case where stderr
		// is redirected to a file.
		fmt.Fprintf(OrigStderr, "%v\n\n%s\n", r, debug.Stack())
	}

	// The panic value is the only reportable; there is no format string.
	reportables := []interface{}{r}
	SendCrashReport(ctx, sv, depth+1, "", reportables)

	// Ensure that the logs are flushed before letting a panic terminate the
	// server.
	Flush()
}
// crashReportURL is the DSN handed to raven in SetupCrashReporter. It is
// computed once, when the package's variables are initialized: the value is
// looked up via envutil.EnvOrDefaultString under COCKROACH_CRASH_REPORTS, with
// a fallback that is the Cockroach Labs endpoint for release builds and the
// empty string otherwise.
var crashReportURL = func() string {
	fallback := ""
	if build.IsRelease() {
		fallback = "https://ignored:ignored@errors.cockroachdb.com/sentry"
	}
	return envutil.EnvOrDefaultString("COCKROACH_CRASH_REPORTS", fallback)
}()
// SetupCrashReporter sets the crash reporter info: it points raven at
// crashReportURL and attaches the build information and the invoked command
// as release, environment and tags. It panics if raven rejects the DSN.
//
// cmd is the name of the command being run; "start" is reported as "server".
func SetupCrashReporter(ctx context.Context, cmd string) {
	err := raven.SetDSN(crashReportURL)
	if err != nil {
		panic(errors.Wrap(err, "failed to setup crash reporting"))
	}

	cmdTag := cmd
	if cmdTag == "start" {
		cmdTag = "server"
	}

	info := build.GetInfo()
	raven.SetRelease(info.Tag)
	raven.SetEnvironment(info.Type)

	tags := make(map[string]string, 5)
	tags["cmd"] = cmdTag
	tags["platform"] = info.Platform
	tags["distribution"] = info.Distribution
	tags["rev"] = info.Revision
	tags["goversion"] = info.GoVersion
	raven.SetTagsContext(tags)
}
// crdbPaths lists the import path prefixes that SendCrashReport hands to
// raven.NewStacktrace as its third argument.
// NOTE(review): presumably raven uses these prefixes to mark stack frames as
// belonging to the application — confirm against the raven-go documentation.
var crdbPaths = []string{
"github.com/cockroachdb/cockroach",
"github.com/coreos/etcd/raft",
}
// uptimeTag buckets the time elapsed between startTime and now into a coarse,
// human-readable upper bound: "<1s", "<10s", "<1m", "<10m", "<1h", "<10h",
// and beyond that "<Nd" where N is the number of whole days elapsed plus one.
func uptimeTag(now time.Time) string {
	elapsed := now.Sub(startTime)

	// Buckets are ordered by increasing limit; the first limit that exceeds
	// the elapsed time wins.
	buckets := [...]struct {
		limit time.Duration
		tag   string
	}{
		{time.Second, "<1s"},
		{10 * time.Second, "<10s"},
		{time.Minute, "<1m"},
		{10 * time.Minute, "<10m"},
		{time.Hour, "<1h"},
		{10 * time.Hour, "<10h"},
	}
	for _, b := range buckets {
		if elapsed < b.limit {
			return b.tag
		}
	}

	// Ten hours or more: report in days, rounding up to the next full day.
	const day = 24 * time.Hour
	wholeDays := int(elapsed / day)
	return fmt.Sprintf("<%dd", wholeDays+1)
}
// safeError is an error that carries nothing but a message string.
// reportablesToSafeError builds it from already-redacted text.
type safeError struct {
	message string
}

// Compile-time check that *safeError satisfies the error interface.
var _ error = (*safeError)(nil)

// Error implements the error interface by returning the stored message.
func (se *safeError) Error() string {
	return se.message
}
// redact returns a redacted version of the supplied item that is safe to use in
// anonymized reporting.
//
// The rules, in order:
//   - a SafeMessager is reported via its SafeMessage();
//   - anything that is not an error is reduced to its type (or, if it carries
//     a pkg/errors stack trace, the file:line of the innermost frame);
//   - runtime.Error and syscall.Errno are reported with their message;
//   - *os.SyscallError, *os.PathError and *os.LinkError are reported with
//     their nested error redacted recursively (SyscallError) or their paths
//     replaced by "<redacted>" (PathError, LinkError);
//   - a small whitelist of sentinel errors is reported with its message;
//   - every other error is reduced to its type.
//
// If r additionally exposes a Cause() method, the redacted cause is appended
// as ": caused by ...".
func redact(r interface{}) string {
	// typAnd combines a description of i's type with the (already safe) text.
	// An empty text yields only the type description.
	typAnd := func(i interface{}, text string) string {
		type stackTracer interface {
			StackTrace() errors.StackTrace
		}
		typ := fmt.Sprintf("%T", i)
		if e, ok := i.(stackTracer); ok {
			tr := e.StackTrace()
			if len(tr) > 0 {
				typ = fmt.Sprintf("%v", tr[0]) // prints file:line
			}
		}
		if text == "" {
			return typ
		}
		// Don't bother reporting the type for errors.New() and its
		// nondescript siblings. Note that errors coming from pkg/errors
		// usually have `typ` overridden to file:line above, so they won't
		// hit this path.
		//
		// %T renders pointer types with a leading "*" (errors.New() values
		// print as "*errors.errorString"), so the "*" has to be stripped
		// before testing the package prefix; otherwise this branch can never
		// match those values.
		if strings.HasPrefix(strings.TrimPrefix(typ, "*"), "errors.") {
			return text
		}
		return typ + ": " + text
	}

	// handle redacts r itself, without looking at its cause.
	handle := func(r interface{}) string {
		switch t := r.(type) {
		case SafeMessager:
			return t.SafeMessage()
		case error:
			// continue below
		default:
			return typAnd(r, "")
		}

		// Now that we're looking at an error, see if it's one we can
		// deconstruct for maximum (safe) clarity. Separating this from the
		// block above ensures that the types below actually implement `error`.
		switch t := r.(error).(type) {
		case runtime.Error:
			return typAnd(t, t.Error())
		case syscall.Errno:
			return typAnd(t, t.Error())
		case *os.SyscallError:
			s := redact(t.Err)
			return typAnd(t, fmt.Sprintf("%s: %s", t.Syscall, s))
		case *os.PathError:
			// It hardly matters, but avoid mutating the original.
			cpy := *t
			t = &cpy
			t.Path = "<redacted>"
			return typAnd(t, t.Error())
		case *os.LinkError:
			// It hardly matters, but avoid mutating the original.
			cpy := *t
			t = &cpy
			t.Old, t.New = "<redacted>", "<redacted>"
			return typAnd(t, t.Error())
		default:
		}

		// Still an error, but not one we know how to deconstruct.
		switch r.(error) {
		case context.DeadlineExceeded:
		case context.Canceled:
		case os.ErrInvalid:
		case os.ErrPermission:
		case os.ErrExist:
		case os.ErrNotExist:
		case os.ErrClosed:
		default:
			// Not a whitelisted sentinel error.
			return typAnd(r, "")
		}
		// Whitelisted sentinel error.
		return typAnd(r, r.(error).Error())
	}

	type causer interface {
		Cause() error
	}

	reportable := handle(r)
	if c, ok := r.(causer); ok {
		reportable += ": caused by " + redact(c.Cause())
	}
	return reportable
}
func reportablesToSafeError(depth int, format string, reportables []interface{}) error {
if len(reportables) == 0 {
reportables = []interface{}{"nothing reported"}
}
file := "?"
var line int
if depth > 0 {
file, line, _ = caller.Lookup(depth)
}
redacted := make([]string, 0, len(reportables))
for i := range reportables {
redacted = append(redacted, redact(reportables[i]))
}
reportables = nil
var sep string
// TODO(tschottdorf): it would be nice to massage the format so that all of its verbs are replaced by %v
// (so that we could now call `fmt.Sprintf(newFormat, reportables...)`).
// This isn't trivial. For example, "%ss %.2f %#v %U+%04X %%" would become "%ss %s %s %s %%".
// The logic to do that is known to `fmt.Printf` but we'd have to copy it here.
if format != "" {
sep = " | "
}
err := &safeError{
message: fmt.Sprintf("%s:%d: %s%s%s", filepath.Base(file), line, format, sep, strings.Join(redacted, "; ")),
}
return err
}
// SendCrashReport posts to sentry. The `reportables` is essentially the `args...` in
// `log.Fatalf(format, args...)` (similarly for `log.Fatal`) or `[]interface{}{arg}` in
// `panic(arg)`.
//
// Nothing is sent unless both DiagnosticsReportingEnabled and CrashReports are
// true and a raven client is configured. The format string and the
// reportables are turned into the reported error by reportablesToSafeError,
// which redacts the reportables. The call waits up to ten seconds for the
// submission to complete and shouts the outcome.
//
// TODO(dt,knz,sql-team): we need to audit all sprintf'ing of values into the errors and strings
// passed to panic, to ensure raw user data is kept separate and can thus be elided here. For now,
// the type is about all we can assume is safe to report, which combined with file and line info
// should be at least somewhat helpful in telling us where crashes are coming from. We capture the
// full stacktrace below, so we only need the short file and line here to help uniquely identify
// the error. Some exceptions, like a runtime.Error, are assumed to be fine as-is.
func SendCrashReport(
	ctx context.Context, sv *settings.Values, depth int, format string, reportables []interface{},
) {
	if enabled := DiagnosticsReportingEnabled.Get(sv) && CrashReports.Get(sv); !enabled {
		return // disabled via settings.
	}
	if raven.DefaultClient == nil {
		return // disabled via empty URL env var.
	}

	safeErr := reportablesToSafeError(depth+1, format, reportables)

	// This is close to inlining raven.CaptureErrorAndWait(), except it lets us
	// control the stack depth of the collected trace.
	const contextLines = 3
	trace := raven.NewStacktrace(depth+1, contextLines, crdbPaths)
	exception := raven.NewException(safeErr, trace)
	packet := raven.NewPacket(safeErr.Error(), exception)

	// Avoid leaking the machine's hostname by injecting the literal "<redacted>".
	// Otherwise, raven.Client.Capture will see an empty ServerName field and
	// automatically fill in the machine's hostname.
	packet.ServerName = "<redacted>"

	extraTags := map[string]string{"uptime": uptimeTag(timeutil.Now())}
	eventID, done := raven.DefaultClient.Capture(packet, extraTags)

	// Bound the wait so that a failing submission cannot hold up the caller
	// indefinitely.
	const submitTimeout = 10 * time.Second
	select {
	case <-done:
		Shout(ctx, Severity_ERROR, "Reported as error "+eventID)
	case <-time.After(submitTimeout):
		Shout(ctx, Severity_ERROR, "Time out trying to submit crash report")
	}
}