Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

extend self-test log processing #151

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,17 @@ var (
},
nil,
)
// metricDeviceSelfTest describes the smartctl_device_self_test_log_seconds
// metric, labelled by device, self-test log type, and whether the test passed.
metricDeviceSelfTest = prometheus.NewDesc(
"smartctl_device_self_test_log_seconds",
"Device SMART self test log execution lifetime seconds",
[]string{
"device",
"self_test_log_type",
"self_test_passed",
},
nil,
)

metricDeviceSelfTestLogCount = prometheus.NewDesc(
"smartctl_device_self_test_log_count",
"Device SMART self test log count",
Expand Down
2 changes: 1 addition & 1 deletion readjson.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ func readFakeSMARTctl(logger log.Logger, device string) gjson.Result {
// Get json from smartctl and parse it
func readSMARTctl(logger log.Logger, device string) (gjson.Result, bool) {
level.Debug(logger).Log("msg", "Collecting S.M.A.R.T. counters", "device", device)
out, err := exec.Command(*smartctlPath, "--json", "--info", "--health", "--attributes", "--tolerance=verypermissive", "--nocheck=standby", "--format=brief", "--log=error", device).Output()
out, err := exec.Command(*smartctlPath, "--json", "--info", "--health", "--attributes", "--tolerance=verypermissive", "--nocheck=standby", "--format=brief", "--log=error", "--log=selftest", device).Output()
if err != nil {
level.Warn(logger).Log("msg", "S.M.A.R.T. output reading", "err", err, "device", device)
}
Expand Down
46 changes: 46 additions & 0 deletions smartctl.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ package main

import (
"fmt"
"strconv"
"strings"

"github.com/go-kit/log"
Expand Down Expand Up @@ -69,6 +70,7 @@ func (smart *SMARTctl) Collect() {
smart.mineDeviceSCTStatus()
smart.mineDeviceStatistics()
smart.mineDeviceErrorLog()
smart.mineDeviceSelfTest()
smart.mineDeviceSelfTestLog()
smart.mineDeviceERC()
smart.minePercentageUsed()
Expand Down Expand Up @@ -399,6 +401,50 @@ func (smart *SMARTctl) mineDeviceErrorLog() {
}
}

func (smart *SMARTctl) mineDeviceSelfTest() {
validTypes := map[int]string{
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

from smartctl sources:

  switch (test_type) {
    case 0x00: msgtest = "Offline";            break;
    case 0x01: msgtest = "Short offline";      break;
    case 0x02: msgtest = "Extended offline";   break;
    case 0x03: msgtest = "Conveyance offline"; break;
    case 0x04: msgtest = "Selective offline";  break;
    case 0x7f: msgtest = "Abort offline test"; break;
    case 0x81: msgtest = "Short captive";      break;
    case 0x82: msgtest = "Extended captive";   break;
    case 0x83: msgtest = "Conveyance captive"; break;
    case 0x84: msgtest = "Selective captive";  break;
    default:
      if ((0x40 <= test_type && test_type <= 0x7e) || 0x90 <= test_type)
        msgtest = strprintf("Vendor (0x%02x)", test_type);
      else
        msgtest = strprintf("Reserved (0x%02x)", test_type);
  }

255: "vendor",
129: "short_captive",
2: "long",
1: "short",
}

// assume the table will always be in descending order
processedTypes := make(map[string]bool)

for _, logEntry := range smart.json.Get("ata_smart_self_test_log.standard.table").Array() {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should accept either standard or extended. Some args & device combinations only have one of them. The layout of the json struct is the same.

testType := int(logEntry.Get("type.value").Int())
testTime := float64(logEntry.Get("lifetime_hours").Int())
testRunningIndicator := int(logEntry.Get("status.value").Int())
testStatus := strconv.FormatBool(logEntry.Get("status.passed").Bool())

// stick with seconds
testTime = testTime * 60 * 60

// skip running tests
if testRunningIndicator != 0 {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is not correct, from one of my systems:

"status": {
"value": 41,
"string": "Interrupted (host reset)",
"remaining_percent": 90
}

status.passed is NOT present in this case.

I don't have any SATA drives w/ failing checks to compare presently, but I worry they are also non-zero.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, it's definitely in need of work; also in the smartctl sources:

  std::string msgstat;
  switch (test_status >> 4) {
    case 0x0: msgstat = "Completed without error";       break;
    case 0x1: msgstat = "Aborted by host";               break;
    case 0x2: msgstat = "Interrupted (host reset)";      break;
    case 0x3: msgstat = "Fatal or unknown error";        break;
    case 0x4: msgstat = "Completed: unknown failure";    break;
    case 0x5: msgstat = "Completed: electrical failure"; break;
    case 0x6: msgstat = "Completed: servo/seek failure"; break;
    case 0x7: msgstat = "Completed: read failure";       break;
    case 0x8: msgstat = "Completed: handling damage??";  break;
    case 0xf: msgstat = "Self-test routine in progress"; break;
    default:  msgstat = strprintf("Unknown status (0x%x)", test_status >> 4);
  }

So if it's 0xF then skip it as running; otherwise map the error.

continue
}

logTestType, exists := validTypes[testType]
if !exists {
logTestType = "unknown"
}

if !processedTypes[logTestType] {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is implicitly trusting that the tests appear in newest to oldest order. I don't know if I trust drives enough for that.

smart.ch <- prometheus.MustNewConstMetric(
metricDeviceSelfTest,
prometheus.GaugeValue,
testTime,
smart.device.device,
logTestType,
testStatus,
)
processedTypes[logTestType] = true
}
}
}

func (smart *SMARTctl) mineDeviceSelfTestLog() {
for logType, status := range smart.json.Get("ata_smart_self_test_log").Map() {
smart.ch <- prometheus.MustNewConstMetric(
Expand Down