forked from khrpcek/check_ceph
-
Notifications
You must be signed in to change notification settings - Fork 1
/
check_ceph.py
executable file
·257 lines (228 loc) · 12.3 KB
/
check_ceph.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
#!/usr/bin/python
# Examples:
# osd status, warn at 2 missing, crit at 3: ./check_ceph.py -C ceph.conf --id icinga -k ceph.client.icinga.keyring --osd -w 2 -c 3
# general health status: ./check_ceph.py -C ceph.conf --id icinga -k ceph.client.icinga.keyring --health
# pg status, does not take warning or critical arguments yet. Only warns on PGs not in an active+clean state which means some PGs are not in an optimal state. ./check_ceph.py -C ceph.conf --id icinga -k ceph.client.icinga.keyring --pg
# extra performance metrics (iops, read/write bytes/sec): ./check_ceph.py -C ceph.conf --id icinga -k ceph.client.icinga.keyring --perf
# disk space: if run with --pool you only alert on that pool; when run without --pool the thresholds apply to every pool. Warning and critical are compared against the max avail field from `ceph df`: ./check_ceph.py -C ceph.conf --id icinga -k ceph.client.icinga.keyring --df -w 100 -c 50
#
#
import sys
import argparse
import json
import subprocess
# ceph osd stat
# ceph mon stat
# ceph pg stat
# ceph health status
# ceph mon_status
# ceph quorum status
def checkHealth(args):
    """Report the cluster health from ``ceph health`` as a Nagios status.

    Runs ``ceph --id ID [-c CONF] [-k KEYRING] --format json health``,
    prints a one-line message and terminates the process with the matching
    plugin exit code.

    Args:
        args: parsed command line namespace. ``id`` is always passed to
            ``ceph``; ``conf`` and ``keyring`` are only passed when set, so
            an omitted option no longer turns into a literal ``None`` path.

    Exits:
        0 for HEALTH_OK, 1 for HEALTH_WARN, 2 for HEALTH_ERR and 3 (UNKNOWN)
        when ``ceph`` cannot be run, its output is not valid JSON, or the
        reported status is not one of the three above.
    """
    # Argument list instead of a shell string: paths containing spaces or
    # shell metacharacters are handed to ceph untouched.
    cmd = ["ceph", "--id", args.id]
    if args.conf:
        cmd += ["-c", args.conf]
    if args.keyring:
        cmd += ["-k", args.keyring]
    cmd += ["--format", "json", "health"]
    try:
        raw = subprocess.check_output(cmd)
        if isinstance(raw, bytes):
            raw = raw.decode("utf-8")
        ceph_health_dict = json.loads(raw)
    except (OSError, subprocess.CalledProcessError, ValueError) as err:
        # A traceback would exit with 1, which Nagios reads as WARNING.
        print("UNKNOWN: unable to read ceph health: %s" % err)
        sys.exit(3)

    # NOTE(review): older releases appear to expose only 'overall_status';
    # fall back to it when 'status' is absent - confirm against the oldest
    # supported cluster.
    status = ceph_health_dict.get(
        'status', ceph_health_dict.get('overall_status'))
    exit_codes = {'HEALTH_OK': 0, 'HEALTH_WARN': 1, 'HEALTH_ERR': 2}
    if status not in exit_codes:
        print("UNKNOWN: unexpected ceph health status: %s" % status)
        sys.exit(3)

    if status == 'HEALTH_OK':
        print("%s" % status)
        sys.exit(0)

    # Prefer the summary list layout; when it is missing or empty fall back
    # to the name of the first entry in 'checks'.
    try:
        label = ceph_health_dict['overall_status']
        detail = ceph_health_dict['summary'][0]['summary']
    except (KeyError, IndexError):
        label = status
        checks = ceph_health_dict.get('checks') or {}
        detail = next(iter(checks), 'no detail available')
    print("%s: %s" % (label, detail))
    sys.exit(exit_codes[status])
def checkOSD(args):
    """Compare the number of OSDs that are not up / not in with -w and -c.

    Runs ``ceph ... --format json osd stat`` and evaluates the larger of
    "OSDs not up" and "OSDs not in" against the thresholds. The critical
    threshold is tested first, so a critical shortfall in one counter is
    never masked by a warning-level shortfall in the other.

    Args:
        args: parsed command line namespace. Reads ``id``, ``conf``,
            ``keyring`` (the latter two only passed to ``ceph`` when set),
            ``warning`` and ``critical`` (numbers of missing OSDs).

    Exits:
        2 when either counter is >= critical, 1 when either is >= warning,
        0 otherwise (including "some OSDs missing but below warning"), and
        3 (UNKNOWN) for missing/non-numeric thresholds, a failing ``ceph``
        call or unexpected JSON.
    """
    try:
        warn = float(args.warning)
        crit = float(args.critical)
    except (TypeError, ValueError):
        # TypeError: option not given (None); ValueError: not a number.
        print("UNKNOWN: the OSD check needs numeric -w/--warning and "
              "-c/--critical thresholds")
        sys.exit(3)

    # Argument list instead of a shell string; unset options are omitted.
    cmd = ["ceph", "--id", args.id]
    if args.conf:
        cmd += ["-c", args.conf]
    if args.keyring:
        cmd += ["-k", args.keyring]
    cmd += ["--format", "json", "osd", "stat"]
    try:
        raw = subprocess.check_output(cmd)
        if isinstance(raw, bytes):
            raw = raw.decode("utf-8")
        osd_stat_dict = json.loads(raw)
    except (OSError, subprocess.CalledProcessError, ValueError) as err:
        print("UNKNOWN: unable to read ceph osd stat: %s" % err)
        sys.exit(3)

    # Some output layouts nest the counters under an "osdmap" key.
    if 'num_osds' not in osd_stat_dict or 'num_up_osds' not in osd_stat_dict:
        osd_stat_dict = osd_stat_dict.get('osdmap', {})
    try:
        num_osds = osd_stat_dict['num_osds']
        num_up = osd_stat_dict['num_up_osds']
        num_in = osd_stat_dict['num_in_osds']
    except KeyError as err:
        print("UNKNOWN: 'ceph osd stat' output is missing %s" % err)
        sys.exit(3)

    perf_string = "num_osds={0} num_up_osds={1} num_in_osds={2}".format(
        num_osds, num_up, num_in)
    summary = "{0} OSDS. {1} up, {2} in|{3}".format(
        num_osds, num_up, num_in, perf_string)

    # The worse of the two shortfalls decides the state.
    missing = max(num_osds - num_up, num_osds - num_in)
    if missing >= crit:
        print("CRITICAL: ALL OSDs are not up and in. " + summary)
        sys.exit(2)
    if missing >= warn:
        print("WARNING: ALL OSDs are not up and in. " + summary)
        sys.exit(1)
    if missing == 0:
        print("ALL OSDs are up and in. " + summary)
    else:
        # Degraded, but by fewer OSDs than the warning threshold: this is an
        # OK state, not an internal error.
        print("OK: OSDs not up/in are below the warning threshold. " + summary)
    sys.exit(0)
def checkMON(args):
    """Placeholder for the monitor (MON) check, which is not implemented.

    The check has no logic yet, so instead of returning silently (which a
    monitoring system would read as OK) it reports UNKNOWN.

    Args:
        args: parsed command line namespace (currently unused).

    Exits:
        Always 3 (UNKNOWN).
    """
    # TODO: implement using `ceph mon stat` / quorum status.
    print("UNKNOWN: MON check is not implemented yet")
    sys.exit(3)
def checkPG(args):
    """Warn when not every placement group is in an active+clean state.

    Runs ``ceph ... --format json pg stat``. A PG state counts as healthy
    when its name contains ``active+clean`` (so e.g.
    ``active+clean+scrubbing`` is included). Every state is also emitted as
    performance data.

    Args:
        args: parsed command line namespace. Reads ``id``, ``conf`` and
            ``keyring``; the latter two are only passed to ``ceph`` when set.
            Thresholds are not used by this check.

    Exits:
        0 when all PGs are active+clean, 1 when some are not, 3 (UNKNOWN)
        when ``ceph`` fails or the JSON has neither known layout.
    """
    # Argument list instead of a shell string; unset options are omitted.
    cmd = ["ceph", "--id", args.id]
    if args.conf:
        cmd += ["-c", args.conf]
    if args.keyring:
        cmd += ["-k", args.keyring]
    cmd += ["--format", "json", "pg", "stat"]
    try:
        raw = subprocess.check_output(cmd)
        if isinstance(raw, bytes):
            raw = raw.decode("utf-8")
        pg_stat_dict = json.loads(raw)
    except (OSError, subprocess.CalledProcessError, ValueError) as err:
        print("UNKNOWN: unable to read ceph pg stat: %s" % err)
        sys.exit(3)

    if 'num_pgs' in pg_stat_dict:
        # pre nautilus json format: counters at the top level
        pg_summary = pg_stat_dict
    elif 'pg_summary' in pg_stat_dict:
        # nautilus json format: counters nested under 'pg_summary'
        pg_summary = pg_stat_dict['pg_summary']
    else:
        print("UNKNOWN: unrecognised 'ceph pg stat' output, no PG summary found")
        sys.exit(3)

    num_pgs = pg_summary['num_pgs']
    states = pg_summary['num_pg_by_state']
    active_pgs = sum(
        state['num'] for state in states if "active+clean" in state['name'])
    # Each entry keeps its trailing space so the output format is unchanged.
    perf_string = "".join(
        "%s=%s " % (state['name'], state['num']) for state in states)

    # Maybe build in a percentage based threshold for users who want that.
    if active_pgs < num_pgs:
        print("WARNING: All PGs are not active+clean: {0} PGs Total, {1}|{1}".format(num_pgs, perf_string))
        sys.exit(1)
    if active_pgs == num_pgs:
        print("All PGs are active+clean: {0} PGs Total, {1}|{1}".format(num_pgs, perf_string))
        sys.exit(0)
    # More active+clean PGs than PGs in total: inconsistent input.
    print("Script shouldn't reach this point. Thar be bugs!")
    sys.exit(3)
def checkPerf(args):
    """Emit cluster I/O rates from ``ceph pg stat`` as performance data.

    Reads ``read_bytes_sec``, ``write_bytes_sec`` and ``io_sec`` from the
    JSON output; a counter that is absent is reported as 0. The counters are
    looked up under ``pg_summary`` first (the nested layout) and then at the
    top level (the flat layout).

    Args:
        args: parsed command line namespace. Reads ``id``, ``conf`` and
            ``keyring``; the latter two are only passed to ``ceph`` when set.

    Exits:
        0 after printing the statistics, 3 (UNKNOWN) when ``ceph`` fails or
        its output is not valid JSON.
    """
    # Argument list instead of a shell string; unset options are omitted.
    cmd = ["ceph", "--id", args.id]
    if args.conf:
        cmd += ["-c", args.conf]
    if args.keyring:
        cmd += ["-k", args.keyring]
    cmd += ["--format", "json", "pg", "stat"]
    try:
        raw = subprocess.check_output(cmd)
        if isinstance(raw, bytes):
            raw = raw.decode("utf-8")
        pg_stat_dict = json.loads(raw)
    except (OSError, subprocess.CalledProcessError, ValueError) as err:
        print("UNKNOWN: unable to read ceph pg stat: %s" % err)
        sys.exit(3)

    # NOTE(review): assumes the nested layout keeps the rate counters inside
    # 'pg_summary'; the top-level lookup stays as a fallback either way.
    nested = pg_stat_dict.get('pg_summary')
    if not isinstance(nested, dict):
        nested = {}
    counters = ('read_bytes_sec', 'write_bytes_sec', 'io_sec')
    values = [nested.get(key, pg_stat_dict.get(key, 0)) for key in counters]

    perf_string = "read_bytes_sec={0} write_bytes_sec={1} io_sec={2}".format(
        *values)
    print("Healthy: Additional perf stats for cluster {0}|{0}".format(perf_string))
    sys.exit(0)
def checkDF(args):
    """Check free space (``max_avail``) of one pool or of every pool.

    Runs ``ceph ... --format json df``, converts byte counts to whole units
    of ``-b`` (floor division) and compares each pool's ``max_avail`` with
    the thresholds: below ``-c`` is critical, below ``-w`` is warning,
    anything else (including a value exactly equal to ``-w``) is healthy.
    Global and per-pool figures are appended as performance data.

    Args:
        args: parsed command line namespace. Reads ``id``, ``conf``,
            ``keyring`` (the latter two only passed to ``ceph`` when set),
            ``warning``, ``critical``, ``byte`` ("G", "T" or "P"; anything
            else, including unset, means terabytes) and ``pool`` (optional
            pool name; when unset all pools are checked).

    Exits:
        0 healthy, 1 warning, 2 critical, 3 (UNKNOWN) for missing or
        non-numeric thresholds, a failing ``ceph`` call, unexpected JSON, or
        a ``--pool`` that does not exist in the cluster.
    """
    try:
        warn = float(args.warning)
        crit = float(args.critical)
    except (TypeError, ValueError):
        # TypeError: option not given (None); ValueError: not a number.
        print("UNKNOWN: the df check needs numeric -w/--warning and "
              "-c/--critical thresholds")
        sys.exit(3)

    units = {
        "G": (1024**3, "GB"),
        "T": (1024**4, "TB"),
        "P": (1024**5, "PB"),
    }
    # Terabytes is the documented default, also used for unknown values.
    byte_divisor, perf_metric = units.get(args.byte, units["T"])

    # Argument list instead of a shell string; unset options are omitted.
    cmd = ["ceph", "--id", args.id]
    if args.conf:
        cmd += ["-c", args.conf]
    if args.keyring:
        cmd += ["-k", args.keyring]
    cmd += ["--format", "json", "df"]
    try:
        raw = subprocess.check_output(cmd)
        if isinstance(raw, bytes):
            raw = raw.decode("utf-8")
        ceph_df_dict = json.loads(raw)
    except (OSError, subprocess.CalledProcessError, ValueError) as err:
        print("UNKNOWN: unable to read ceph df: %s" % err)
        sys.exit(3)

    try:
        # `//` keeps the whole-unit results identical on Python 2 and 3.
        stats = ceph_df_dict['stats']
        global_total = stats['total_bytes'] // byte_divisor
        global_used = stats['total_used_bytes'] // byte_divisor
        global_avail = stats['total_avail_bytes'] // byte_divisor
        pool_stats = {}
        for pool in ceph_df_dict['pools']:
            pool_stats[pool['name']] = {
                'bytes_used': pool['stats']['bytes_used'] // byte_divisor,
                'max_avail': pool['stats']['max_avail'] // byte_divisor,
                'objects': pool['stats']['objects'],
            }
    except KeyError as err:
        print("UNKNOWN: 'ceph df' output is missing %s" % err)
        sys.exit(3)

    perf_string = "global_total_bytes={0}{3} global_used_bytes={1}{3} global_avail_bytes={2}{3} ".format(
        global_total, global_used, global_avail, perf_metric)
    for name, values in pool_stats.items():
        perf_string += "{0}_bytes_used={1}{2} {0}_max_avail={3}{2} {0}_objects={4} ".format(
            name, values['bytes_used'], perf_metric, values['max_avail'], values['objects'])

    if args.pool:
        # A named pool that does not exist must not silently turn into an
        # "all pools" check.
        if args.pool not in pool_stats:
            print("UNKNOWN: Ceph pool {0} not found|{1}".format(args.pool, perf_string))
            sys.exit(3)
        avail = pool_stats[args.pool]['max_avail']
        # add in percentage later
        if avail < crit:
            print("CRITICAL: Ceph pool {0} has {1}{2} availbale|{3}".format(args.pool, avail, perf_metric, perf_string))
            sys.exit(2)
        if avail < warn:
            print("WARNING: Ceph pool {0} has {1}{2} availbale|{3}".format(args.pool, avail, perf_metric, perf_string))
            sys.exit(1)
        print("Healthy: Ceph pool {0} has {1}{2} availbale|{3}".format(args.pool, avail, perf_metric, perf_string))
        sys.exit(0)

    # Alerts based on all pools. If any pool crosses a threshold we alert.
    warn_list = []
    crit_list = []
    for name, values in pool_stats.items():
        entry = "%s:%s%s" % (name, values['max_avail'], perf_metric)
        if values['max_avail'] < crit:
            crit_list.append(entry)
        elif values['max_avail'] < warn:
            warn_list.append(entry)
    if crit_list:
        print("CRITICAL: Ceph pool(s) critically low on free space. Critial:{0} Warning:{1}|{2}".format(crit_list, warn_list, perf_string))
        sys.exit(2)
    if warn_list:
        print("WARNING: Ceph pool(s) low on free space. {0}|{1}".format(warn_list, perf_string))
        sys.exit(1)
    print("Healthy: All ceph pools are within free space thresholds|{0}".format(perf_string))
    sys.exit(0)
def main(argv=None):
    """Parse the command line and run the selected ceph check.

    Exactly one check is run, picked in this order of precedence:
    ``--health``, ``--osd``, ``--pg``, ``--df``, ``--perf``, ``--mon``.
    The check functions print their result and normally terminate the
    process themselves.

    Args:
        argv: list of argument strings; ``None`` (the default) makes
            argparse read ``sys.argv[1:]``.

    Exits:
        3 (UNKNOWN) when no check option was given, so a misconfigured
        service definition does not show up as OK. argparse itself exits
        with 2 on invalid or missing required options.
    """
    parser = argparse.ArgumentParser(
        description='Runs health checks against a ceph cluster. This is designed to run on the monitoring server using the ceph client software. Supply a ceph.conf, keyring, and user to access the cluster.')
    # The default makes the documented fallback path real instead of
    # handing an unset value to the checks.
    parser.add_argument(
        '-C', '--conf', default='/etc/ceph/ceph.conf',
        help='ceph.conf file, defaults to /etc/ceph/ceph.conf.')
    parser.add_argument('-id', '--id', help='Ceph authx user', required=True)
    # "$id" written without a backslash: "\$" is an invalid escape sequence
    # and showed up literally in the help output.
    parser.add_argument(
        '-k', '--keyring', help='Path to ceph keyring if not in /etc/ceph/client.$id.keyring')
    parser.add_argument(
        '--health', help='Get general health status. ex. HEALTH_OK, HEALTH_WARN', action="store_true")
    parser.add_argument(
        '-o', '--osd', help='OSD status. Thresholds are in number of OSDs missing', action="store_true")
    parser.add_argument(
        '-m', '--mon', help='MON status. Thesholds are in number of mons missing')
    parser.add_argument(
        '-p', '--pg', help='PG status. No thresholds due to the large number of pg states.', action="store_true")
    parser.add_argument(
        '--perf', help='collects additional ceph performance statistics', action='store_true')
    parser.add_argument('--df', help='Disk/cluster usage. Reports global and all pools unless --pool is used. Warning and critical are number of -b free to the pools. This is not Raw Free, but Max Avail to the pools based on rep or k,m settings. If you do not define a pool the threshold is run agains all the pools in the cluster.', action="store_true")
    parser.add_argument(
        '-b', '--byte', help="Format to use for displaying DF data. G=Gigabyte, T=Terabyte. Use with the --df option. Defults to TB")
    parser.add_argument('--pool', help='Pool. Use with df')
    parser.add_argument('--objects', help='Object counts based on pool')
    parser.add_argument(
        '-w', '--warning', help='Warning threshold. See specific checks for value types')
    parser.add_argument(
        '-c', '--critical', help='Critical threshold. See specific checks for value types')
    args = parser.parse_args(argv)

    # if/elif (not a lookup table) so only the selected check is resolved.
    if args.health:
        checkHealth(args)
    elif args.osd:
        checkOSD(args)
    elif args.pg:
        checkPG(args)
    elif args.df:
        checkDF(args)
    elif args.perf:
        checkPerf(args)
    elif args.mon:
        checkMON(args)
    else:
        print("UNKNOWN: no check selected; use one of --health, --osd, --pg, --df or --perf")
        sys.exit(3)


if __name__ == "__main__":
    main()