forked from ebalduf/nagfire
-
Notifications
You must be signed in to change notification settings - Fork 0
/
checkSolidFire.py
373 lines (318 loc) · 13 KB
/
checkSolidFire.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
#!/usr/bin/env python3
# author: Joe McManus joe.mcmanus@solidfire.com, scaleoutSean
# file: checkSolidFire.py
# version: 2.2 2021/05/10
# use: Query SolidFire and NetApp HCI clusters and nodes to feed to Nagios, or stand-alone command line
# coding: utf-8
import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
import base64
import json
import sys
import io
import os.path
import math
import socket
import re
import textwrap
import time
# Plugin version string shown in the table header, and the JSON-RPC path
# that sendRequest() appends after https://host:port.
version = "v2.2 2021/05/10"
murl = "/json-rpc/11.0"

# Nagios plugin exit codes, numbered 0 through 4 in this order.
(STATE_OK,
 STATE_WARNING,
 STATE_CRITICAL,
 STATE_UNKNOWN,
 STATE_DEPENDENT) = range(5)

# Running worst-case status; the checks below only ever raise it.
exitStatus = STATE_OK

# Feature switches: 1 enables the check, any other value disables it.
checkClusterFaults = 1  # Generate Alerts on cluster Faults
checkDiskUse = 1  # Generate Alerts on disk access
checkSessions = 1  # Generate Alerts on the number of iSCSI sessions
checkUtilization = 1  # Generate Alerts on the utilization of cluster space
def printUsage(error):
    """Report a problem, show the command-line synopsis and stop the plugin.

    error -- message text printed after the "ERROR: " prefix.

    This function never returns: it ends the process with STATE_UNKNOWN.
    """
    synopsis = " (IP|HOSTNAME) PORT USERNAME PASSWORD (mvip|node)"
    print("ERROR: " + error)
    print("USAGE: " + sys.argv[0] + synopsis)
    raise SystemExit(STATE_UNKNOWN)
# Check command line options that are passed
def commandLineOptions():
    """Read the five positional arguments from sys.argv.

    Returns the tuple (ip, port, username, password, ipType), all as the
    strings given on the command line. Arguments beyond the fifth are ignored.
    Calls printUsage() (which exits) when fewer than five arguments are
    supplied or when ipType is neither "mvip" nor "node".
    """
    if len(sys.argv) < 6:
        printUsage("Incorrect Number of Arguments.")
    ip, port, username, password, ipType = sys.argv[1:6]
    if ipType not in ("mvip", "node"):
        printUsage("Invalid type specified, use node or mvip")
    return ip, port, username, password, ipType
# Send requests to the target
def sendRequest(ip, port, murl, username, password, jsonData, ipType):
    """POST a JSON-RPC payload to the target and return the decoded reply.

    ip, port  -- host and port (both strings) used to build the https URL.
    murl      -- URL path appended after the port.
    username, password -- HTTP basic-auth credentials.
    jsonData  -- request body, already serialized.
    ipType    -- accepted for call compatibility; not used here.

    Returns the parsed JSON body of an HTTP 200 reply. Any other status, or
    a body that is not valid JSON, ends the plugin through printUsage().
    Connection errors and timeouts from requests propagate to the caller.
    """
    url = 'https://' + ip + ":" + port + murl
    # NOTE: verify=False turns off TLS certificate validation for this call.
    r = requests.post(url, data=jsonData, auth=(username, password),
                      verify=False, timeout=10)
    # Look at the status first: error replies (e.g. an auth failure page) are
    # not guaranteed to carry JSON, and decoding them would raise before the
    # intended "invalid response" path could run.
    if r.status_code != 200:
        printUsage("Invalid response received.")
    try:
        return r.json()
    except ValueError:
        # requests reports an undecodable body with a ValueError subclass.
        printUsage("Invalid response received.")
# Check for a valid IP
def ipCheck(ip):
    """Return True when *ip* is exactly one dotted-quad IPv4 address.

    Each of the four octets must be in the range 0-255. The whole string has
    to match: text with anything after the fourth octet (for example
    "1.2.3.4.5" or "1.2.3.4 junk") is rejected, so the caller falls back to
    treating it as a hostname.
    """
    octet = r"(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)"
    pattern = r"\.".join([octet] * 4)
    # fullmatch anchors both ends; re.match would only anchor the start.
    return re.fullmatch(pattern, ip) is not None
# Resolve Hostnames
def checkName(hostname):
    """Verify that *hostname* resolves; exit through printUsage() if it does not.

    Only resolution failures are handled: the socket lookup errors (which are
    OSError subclasses) and the UnicodeError raised for names that cannot be
    encoded. Other exceptions, including KeyboardInterrupt, are left alone.
    """
    try:
        socket.gethostbyname(hostname)
    except (OSError, UnicodeError):
        printUsage("Unable to resolve hostname " + hostname)
# Check if new data has been written to disk
def readwriteCheck(fileName, newUse):
    """Compare the current I/O counters with the ones saved by the last run.

    fileName -- state file holding the counters string from the previous run.
    newUse   -- current counters string; the caller passes the cluster read
                bytes and write bytes concatenated, so "00" means both are 0.

    The state file is overwritten with newUse on every call. Returns a tuple
    (diskUse, exitStatus):
      ("n/a", STATE_UNKNOWN)  -- no state file existed yet; it is created.
      ("No",  STATE_CRITICAL) -- newUse is "00".
      ("No",  STATE_WARNING)  -- counters are unchanged since the last run.
      ("Yes", STATE_OK)       -- counters changed.
    If the state file cannot be read or written, the plugin exits through
    printUsage().
    """
    accessError = ("Unable to open & write to " + fileName +
                   " check perms or set checkDiskUse=0")

    if not os.path.isfile(fileName):
        # First run for this target: nothing to compare against yet.
        try:
            with open(fileName, 'w') as f:
                f.write(newUse)
        except OSError:
            printUsage(accessError)
        return "n/a", STATE_UNKNOWN

    try:
        # The with-block closes the file even when a read or write fails.
        with open(fileName, 'r+') as f:
            previousUse = f.readline()
            f.seek(0)
            f.write(newUse)
            # Drop leftover characters if the old value was longer.
            f.truncate()
    except OSError:
        printUsage(accessError)

    if newUse == "00":
        return "No", STATE_CRITICAL
    if previousUse == newUse:
        return "No", STATE_WARNING
    return "Yes", STATE_OK
# Compare ranges of numbers
def rangeCheck(critical, warning, value):
    """Classify *value* against two thresholds.

    Returns STATE_CRITICAL when value is strictly above *critical*,
    STATE_WARNING when it is strictly above *warning*, otherwise STATE_OK.
    """
    if value > critical:
        return STATE_CRITICAL
    if value > warning:
        return STATE_WARNING
    return STATE_OK
# Add an asterisk to values that are in error
def addNote(testResult, exitStatus, value):
    """Fold one check result into the overall status and flag its value.

    testResult -- status code produced by a single check.
    exitStatus -- overall status accumulated so far.
    value      -- display string for the checked quantity.

    Returns (exitStatus, value). A passing check (testResult of 0) leaves
    both untouched; otherwise "*" is appended to value and the overall
    status is raised to testResult when that is higher.
    """
    if testResult == 0:
        return exitStatus, value
    flagged = value + "*"
    worst = testResult if testResult > exitStatus else exitStatus
    return worst, flagged
# Print a table
def prettyPrint(description, value, width):
    """Print one two-column table row, wrapping the value when it is too long.

    description -- label for the left column.
    value       -- string for the right column.
    width       -- combined width of both columns; each gets half of it.

    A value longer than half of *width* is wrapped onto continuation rows
    whose left column is blank. The wrap length is derived from *width*
    (one less than the column, leaving room for the closing "|"), so rows
    stay aligned for widths other than 60 as well.
    """
    half = int(width / 2)

    if len(value) <= width / 2:
        print("| " + description.ljust(half) + " | " + value +
              "|".rjust(half - len(value)))
        return

    # wrap() returns no lines for whitespace-only text; still emit one row so
    # the output line is always terminated.
    pieces = textwrap.wrap(value, max(half - 1, 1)) or [""]
    firstPrefix = "| " + description.ljust(half) + " | "
    restPrefix = "| ".ljust(half + 2) + " | "
    for index, piece in enumerate(pieces):
        prefix = firstPrefix if index == 0 else restPrefix
        print(prefix + piece + "|".rjust(half - len(piece)))
# Print Exit Status in English
def prettyStatus(exitStatus):
    """Translate a numeric exit status into its display label.

    0 -> "OK", 1 -> "*Warning", 2 -> "*Critical", 3 -> "*Unknown".
    Any other value is also reported as "*Unknown" rather than failing on an
    unassigned variable.
    """
    labels = {
        0: "OK",
        1: "*Warning",
        2: "*Critical",
        3: "*Unknown",
    }
    return labels.get(exitStatus, "*Unknown")
# Check the command line options
# Parse the command line once and unpack the connection settings.
commandOpts = commandLineOptions()
ip, port, username, password, ipType = commandOpts

# Anything that is not a literal IPv4 address is treated as a hostname and
# must resolve before any request is attempted.
if not ipCheck(ip):
    checkName(ip)
def _versionTuple(versionText):
    """Convert a dotted version string into a tuple of ints for comparison.

    Comparing the raw strings is lexicographic, which ranks "9.0" above
    "12.0" and "11.10" below "11.8". Only the leading digits of each dotted
    component are used, parsing stops at the first component that has none,
    and the result is padded with zeros to at least (major, minor).
    """
    numbers = []
    for piece in str(versionText).split("."):
        leading = re.match(r"\d+", piece)
        if leading is None:
            break
        numbers.append(int(leading.group()))
    while len(numbers) < 2:
        numbers.append(0)
    return tuple(numbers)


def _apiResult(method, params=None):
    """Call one JSON-RPC method on the target and return its 'result' member."""
    payload = json.dumps({"method": method, "params": params or {}, "id": 1})
    reply = sendRequest(ip, port, murl, username, password, payload, ipType)
    return reply['result']


# Pieces of the terminal table shared by both modes.
tableRule = "+" + "-"*63 + "+"
tableTitle = "| SolidFire Monitoring Plugin " + version + "|".rjust(20)

if ipType == 'node':
    # Node mode: report the node's cluster state and the MVIP it can reach.
    try:
        nodeState = _apiResult("GetClusterState")
        clusterState = nodeState['state']
    except Exception:
        # Exception (not a bare except) so an exit already requested by
        # sendRequest, or a Ctrl-C, is not replaced by this message.
        printUsage("State not found, are you sure this is a storage node IP?")

    if clusterState != "Active":
        exitStatus = STATE_UNKNOWN
        clusterMvip = "n/a"
        clusterName = "n/a"
    else:
        clusterName = nodeState['cluster']
        details = _apiResult("TestConnectMvip")['details']
        if 'mvip' in details:
            clusterMvip = details['mvip']
            exitStatus = STATE_OK
        else:
            clusterMvip = "*n/a Not in Cluster"
            exitStatus = STATE_WARNING

    printStatus = prettyStatus(exitStatus)
    # A table when run from a terminal, a single line otherwise.
    if sys.stdout.isatty():
        print(tableRule)
        print(tableTitle)
        print(tableRule)
        prettyPrint("Node Status", clusterState, 60)
        prettyPrint("Cluster Name", clusterName, 60)
        prettyPrint("MVIP", clusterMvip, 60)
        prettyPrint("Execution Time ", time.asctime(
            time.localtime(time.time())), 60)
        prettyPrint("Exit State ", printStatus, 60)
        print(tableRule)
    else:
        print("State: " + printStatus + " Node Status: " + clusterState +
              " Cluster Name: " + clusterName + " MVIP: " + clusterMvip)

elif ipType == 'mvip':
    # Get bytes and utilization from GetClusterStats
    clusterStats = _apiResult(
        "GetClusterStats", {"nodeID": "1", "force": "True"})['clusterStats']
    clusterReadBytes = str(clusterStats['readBytes'])
    clusterWriteBytes = str(clusterStats['writeBytes'])
    clusterUse = str(clusterStats['clusterUtilization'])

    # Get iSCSI sessions from ListISCSISessions
    numSessions = len(_apiResult("ListISCSISessions")['sessions'])

    # Get version info; compare it numerically, never as a string
    clusterVersion = _apiResult("GetClusterVersionInfo")['clusterVersion']
    clusterRelease = _versionTuple(clusterVersion)

    # Get Active Storage Nodes and (v12.0+) not in Maintenance Mode
    activeNodes = _apiResult("ListActiveNodes")['nodes']
    if clusterRelease >= (12, 0):
        # The previous test chained "!=" comparisons with "or", which is true
        # for every value, so nothing was ever filtered out.
        # NOTE(review): 'Disabled' is assumed to mean maintenance mode is
        # switched off (normal operation), so such nodes are still counted -
        # confirm against the Element API documentation.
        inMaintenance = ('ReadyForMaintenance', 'PreparingForMaintenance')
        storageNodeCount = sum(
            1 for node in activeNodes
            if node.get('role') == 'Storage'
            and node.get('maintenanceMode') not in inMaintenance)
    else:
        storageNodeCount = len(activeNodes)

    # Get name and members from GetClusterInfo
    clusterInfo = _apiResult("GetClusterInfo")['clusterInfo']
    clusterName = clusterInfo['name']
    ensemble = clusterInfo['ensemble']
    ensembleText = '[%s]' % ', '.join(map(str, ensemble))

    # Check Cluster Faults: any unresolved fault makes the check critical
    if checkClusterFaults == 1:
        openFaults = [fault for fault in _apiResult("ListClusterFaults")['faults']
                      if fault['resolved'] != True]
        if openFaults:
            testResult = STATE_CRITICAL
            # The last 8 characters of the date string are dropped for display.
            clusterFaults = ", ".join(
                fault['date'][:-8] + " " + fault['details']
                for fault in openFaults)
        else:
            testResult = STATE_OK
            clusterFaults = "None"
        exitStatus, clusterFaults = addNote(
            testResult, exitStatus, clusterFaults)
    else:
        clusterFaults = "n/a"

    # Check whether the read/write counters moved since the previous run
    if checkDiskUse == 1:
        # NOTE(review): fixed, predictable file name under /tmp.
        fileName = "/tmp/cluster-" + ip + ".txt"
        newUse = clusterReadBytes + clusterWriteBytes
        diskUse, testResult = readwriteCheck(fileName, newUse)
        exitStatus, diskUse = addNote(testResult, exitStatus, diskUse)
    else:
        diskUse = "n/a"

    # Utilization: warning above 80, critical above 90
    if checkUtilization == 1:
        testResult = rangeCheck(90, 80, float(clusterUse))
        exitStatus, clusterUse = addNote(testResult, exitStatus, clusterUse)

    # Element OS < v11.8 has a soft limit of 400 active volumes and 700
    #   active sessions per node
    # Element OS v11.8-12.3 support 1,000 volumes/node but the details aren't
    #   yet documented (limited info is available in GetLimits response)
    # NOTE: 2-storage node SolidFire clusters use virtual Witness Nodes
    #   I don't have such cluster to test with, but I assume only
    #   storage nodes have the role 'Storage', which is considered above
    # Now one node is deducted from total storage node count to allow for HA
    if clusterRelease >= (11, 8):
        maxSessions = int((storageNodeCount-1) * 1000 * .90)
    else:
        maxSessions = int((storageNodeCount-1) * 700 * .90)
    warnSessions = int(maxSessions * .80)
    if checkSessions == 1:
        testResult = rangeCheck(maxSessions, warnSessions, numSessions)
        exitStatus, numSessions = addNote(
            testResult, exitStatus, str(numSessions))
    else:
        # prettyPrint() needs a string even when the check is switched off.
        numSessions = str(numSessions)

    # check to see if we are being called from a terminal
    if sys.stdout.isatty():
        print(tableRule)
        print(tableTitle)
        print(tableRule)
        prettyPrint("Cluster", ip, 60)
        prettyPrint("Version", str(clusterVersion), 60)
        prettyPrint("Disk Activity", diskUse, 60)
        prettyPrint("Read Bytes", clusterReadBytes, 60)
        prettyPrint("Write Bytes", clusterWriteBytes, 60)
        prettyPrint("Utilization %", clusterUse, 60)
        prettyPrint("iSCSI Sessions", numSessions, 60)
        prettyPrint("Cluster Faults", clusterFaults, 60)
        prettyPrint("Cluster Name", clusterName, 60)
        prettyPrint("Ensemble Members", ensembleText, 60)
        prettyPrint("Execution Time ", time.asctime(
            time.localtime(time.time())), 60)
        prettyPrint("Exit State ", prettyStatus(exitStatus), 60)
        print(tableRule)
    else:
        print("Status: " + prettyStatus(exitStatus) + " Cluster IP: " + ip +
              " Version: " + str(clusterVersion) +
              " Disk Activity: " + diskUse + " Cluster Faults: " + clusterFaults +
              " Read Bytes: " + clusterReadBytes + " Write Bytes: " + clusterWriteBytes +
              " Utilization: " + clusterUse + " ISCSI Sessions: " + str(numSessions) +
              " Name: " + clusterName + " Ensemble: " + ensembleText)

# Hand the accumulated status back to Nagios as the process exit code.
sys.exit(exitStatus)