Permalink
Browse files

nagios + twilio integration

  • Loading branch information...
0 parents commit 404ecf372030642b93a576f3eadb2f897227bf13 @polvi committed Jun 6, 2009
8 .gitignore
@@ -0,0 +1,8 @@
+*.py[co]
+*.sw[po]
+*~
+PYSMELLTAGS*
+_trial*
+*pid
+*log
+dropin.cache
82 README
@@ -0,0 +1,82 @@
+Twilio + Nagios = Easy phone based monitoring and alerts
+--------------------------------------------------------
+
+This package contains two pieces of software:
+
+1) A script used as a notification command for nagios, which integrates nagios and twilio
+2) A webserver which exposes nagios status files via Twilio XML
+
+Put together, it is possible to wire up nagios such that any host or service check will send notifications via twilio, i.e. call you.
+
+Dependencies
+------------
+* A current version of twisted python
+ apt-get install python-twisted or http://twistedmatrix.com/trac/
+* Twilio python libraries
+ http://www.twilio.com/docs/libraries/
+* A nagios installation
+ apt-get install nagios3 or http://www.nagios.org/download/download.php
+
+Nagios Setup
+------------
+
+First, you must get nagios working. Included in this package is an example nagios config, which will be enough to get nagios monitoring ping and http on your local interface.
+
+ example config: examples/nagios.cfg
+
+After installing nagios, set a cfg_file option to point to the examples/nagios.cfg. The nagios conf is generally located at:
+
+ /etc/nagios/nagios.cfg
+
+After you add this line, your nagios config will look something like this:
+
+ cfg_file=/path/to/the/example/nagios.cfg
+ ...
+ object_cache_file=/opt/local/var/nagios/objects.cache
+ precached_object_file=/opt/local/var/nagios/objects.precache
+ resource_file=/opt/local/etc/nagios/resource.cfg
+ ... and so on ...
+
+For the purpose of testing, you should only have one cfg_file option, and no cfg_dir options. This will restrict the nagios install to just the example nagios.cfg included with this distribution.
+
+The included nagios.cfg requires a few modifications to get fully working. The comments in examples/nagios.cfg explain what needs to be changed.
+
+Next, you will need to update scripts/nagios_twilio_pager.py with your Twilio credentials and phone numbers. It should look something like this:
+
+ # Twilio REST API version
+ API_VERSION = '2008-08-01'
+ ACCOUNT_SID = 'YOUR SID HERE'
+ ACCOUNT_TOKEN = 'YOUR TOKEN HERE'
+ # needs to be registered with Twilio
+ CALLER_ID = 'YOUR-CALLER-ID';
+
+ # this needs to point to where you run your "twistd twilio_nagios" instance
+ MONITOR_URL = 'http://localhost:8080'
+
+Assuming you got all the paths setup in the nagios.cfg, you should be able to start nagios now.
+
+Twilio Responder
+----------------
+
+The second service is what Twilio hits to figure out what to say. It works by parsing the nagios status files, based on the params sent to it via Twilio, and generating Twilio XML accordingly.
+
+For testing purposes, you can start it with this command:
+
+ twistd -n twilio_nagios -o test/dat/objects.cache -s test/dat/status.dat
+
+Then, navigate to:
+
+ http://localhost:8080/host/host/localhost
+
+And you should see Twilio XML:
+
+ <Response>
+ <Say>the host production database server is up</Say>
+ </Response>
+
+This webserver will need to be run on a server that is accessible by Twilio, i.e. you will not be able to test this running locally unless your local machine has a public, internet-facing IP address.
+
+How it works
+------------
+
+When nagios determines it is time to send a notification, it sends data to the nagios_twilio_pager.py script. This will fire off a request to Twilio, telling it to hit a specific URL for the host that had the problem. Twilio will then hit a URL on the twilio_nagios responder, which will parse the nagios status and object cache files to look up the current state. It then returns that information in Twilio XML, and the sysadmin gets alerted by a robot voice. Success.
126 examples/nagios.cfg
@@ -0,0 +1,126 @@
+### Contact, i.e. the person that gets called
+# this is where you set the phone number for nagios to call
+# change the contact_name to something more appropriate
+define contact {
+ use admin-via-twilio
+ contact_name admin
+ alias admin
+ pager 5551234567 ; replace with the phone number you want twilio to call
+}
+
+### Host definition and check
+# example host and service that will page via twilio if there is a problem
+# if the notes section is defined, twilio will read that instead of the host alias
+define host {
+ use host-template
+ host_name localhost
+ alias localhost
+ address 127.0.0.1
+ contacts admin
+ check_command check-host-alive
+ notes production database server
+}
+
+### Service Check
+# update the contact to match what you want
+define service{
+ use service-template
+ host_name localhost
+ service_description http
+ check_command check_http
+ contacts admin
+}
+
+## Twilio specific commands
+# these are the command definitions required to use the twilio scripts
+# you need to update the command_line to point to where you installed the script
+define command{
+ command_name notify-host-by-phone
+ command_line /path/to/nagios_twilio_pager.py $CONTACTPAGER$ host host $HOSTNAME$
+}
+
+define command{
+ command_name notify-service-by-phone
+ command_line /path/to/nagios_twilio_pager.py $CONTACTPAGER$ service $SERVICEDESC$ $HOSTNAME$
+}
+
+
+### Templates and other stuff -- nothing to change below this line to get it to work
+# template for setting up admins to use the twilio pager
+define contact{
+ name admin-via-twilio
+ service_notification_period 24x7
+ host_notification_period 24x7
+ service_notification_options w,u,c,r,f,s
+ host_notification_options d,u,r,f,s
+ service_notification_commands notify-service-by-phone
+ host_notification_commands notify-host-by-phone
+ register 0
+}
+
+###
+# begin template stuff and other needed things to get nagios to run
+define host{
+ name host-template
+ max_check_attempts 3
+ notification_interval 120
+ notifications_enabled 1
+ event_handler_enabled 1
+ flap_detection_enabled 1
+ check_period 24x7
+ check_interval 5
+ retry_interval 1
+ retain_status_information 1
+ notification_interval 120
+ register 0
+}
+
+define service{
+ name service-template
+ active_checks_enabled 1
+ passive_checks_enabled 1
+ parallelize_check 1
+ obsess_over_service 1
+ check_freshness 0
+ notifications_enabled 1
+ event_handler_enabled 1
+ flap_detection_enabled 1
+ failure_prediction_enabled 1
+ process_perf_data 1
+ retain_status_information 1
+ retain_nonstatus_information 1
+ is_volatile 0
+ check_period 24x7
+ max_check_attempts 3
+ normal_check_interval 10
+ retry_check_interval 2
+ notification_options w,u,c,r
+ notification_interval 60
+ notification_period 24x7
+ event_handler_enabled 0
+ max_check_attempts 3
+ normal_check_interval 3
+ retry_check_interval 1
+ register 0
+}
+
+define timeperiod{
+ timeperiod_name 24x7
+ alias 24 Hours A Day, 7 Days A Week
+ sunday 00:00-24:00
+ monday 00:00-24:00
+ tuesday 00:00-24:00
+ wednesday 00:00-24:00
+ thursday 00:00-24:00
+ friday 00:00-24:00
+ saturday 00:00-24:00
+}
+
+define command{
+ command_name check-host-alive
+ command_line $USER1$/check_ping -H $HOSTADDRESS$ -w 3000.0,80% -c 5000.0,100% -p 5
+}
+define command{
+ command_name check_http
+ command_line $USER1$/check_http -I $HOSTADDRESS$ $ARG1$
+ }
30 scripts/nagios_twilio_pager.py
@@ -0,0 +1,30 @@
#!/usr/bin/env python
"""Nagios notification command that places a phone call through Twilio.

Usage:
    nagios_twilio_pager.py NUMBER TYPE SERVICE HOST

NUMBER is the phone number to call, TYPE is ``host`` or ``service``,
SERVICE is the service description (``host`` for host notifications) and
HOST is the nagios host name.  The script asks Twilio to call NUMBER and
to fetch what to say from ``MONITOR_URL/TYPE/SERVICE/HOST``.
"""

import sys

import twiliorest

try:
    from urllib import quote  # Python 2
except ImportError:
    from urllib.parse import quote  # Python 3

# Twilio REST API version
API_VERSION = '2008-08-01'
ACCOUNT_SID = 'YOUR SID HERE'
ACCOUNT_TOKEN = 'YOUR TOKEN HERE'
# needs to be registered with Twilio
CALLER_ID = 'YOUR-CALLER-ID'

# this needs to point to where you run your "twistd twilio_nagios" instance
MONITOR_URL = 'http://localhost:8080'


def build_callback_url(base_url, kind, service, host):
    """Return the responder URL Twilio should fetch for this notification.

    Every path segment is percent-encoded so that service descriptions or
    host names containing spaces, slashes or other reserved characters
    still produce a valid URL.
    """
    segments = [quote(part, safe='') for part in (kind, service, host)]
    return '%s/%s' % (base_url.rstrip('/'), '/'.join(segments))


def main(argv):
    """Place the call described by *argv*; return a process exit status."""
    if len(argv) < 5:
        sys.stderr.write('usage: %s NUMBER TYPE SERVICE HOST\n' % argv[0])
        return 2

    # Extra trailing arguments are ignored, as before.
    number, kind, service, host = argv[1:5]

    # Create a Twilio REST account object using the account ID and token
    account = twiliorest.Account(ACCOUNT_SID, ACCOUNT_TOKEN)

    params = {
        'Caller': CALLER_ID,
        'Called': number,
        'Url': build_callback_url(MONITOR_URL, kind, service, host),
    }
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print(account.request('/%s/Accounts/%s/Calls' % (API_VERSION, ACCOUNT_SID),
                          'POST', params))
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))
125 test/dat/objects.cache
@@ -0,0 +1,125 @@
+########################################
+# NAGIOS OBJECT CACHE FILE
+#
+# THIS FILE IS AUTOMATICALLY GENERATED
+# BY NAGIOS. DO NOT MODIFY THIS FILE!
+#
+# Created: Fri Jun 5 17:48:42 2009
+########################################
+
+define timeperiod {
+ timeperiod_name 24x7
+ alias 24 Hours A Day, 7 Days A Week
+ sunday 00:00-24:00
+ monday 00:00-24:00
+ tuesday 00:00-24:00
+ wednesday 00:00-24:00
+ thursday 00:00-24:00
+ friday 00:00-24:00
+ saturday 00:00-24:00
+ }
+
+define command {
+ command_name check-host-alive
+ command_line $USER1$/check_ping -H $HOSTADDRESS$ -w 3000.0,80% -c 5000.0,100% -p 5
+ }
+
+define command {
+ command_name check_http
+ command_line $USER1$/check_http -I $HOSTADDRESS$ $ARG1$
+ }
+
+define command {
+ command_name notify-host-by-phone
+ command_line /path/to/nagios_twilio_pager.py $CONTACTPAGER$ host host $HOSTNAME$
+ }
+
+define command {
+ command_name notify-service-by-phone
+ command_line /path/to/nagios_twilio_pager.py $CONTACTPAGER$ service $SERVICEDESC$ $HOSTNAME$
+ }
+
+define contact {
+ contact_name admin
+ alias admin
+ service_notification_period 24x7
+ host_notification_period 24x7
+ service_notification_options w,u,c,r,f,s
+ host_notification_options d,u,r,f,s
+ service_notification_commands notify-service-by-phone
+ host_notification_commands notify-host-by-phone
+ pager 5551234567
+ host_notifications_enabled 1
+ service_notifications_enabled 1
+ can_submit_commands 1
+ retain_status_information 1
+ retain_nonstatus_information 1
+ }
+
+define host {
+ host_name localhost
+ alias localhost
+ address 127.0.0.1
+ check_period 24x7
+ check_command check-host-alive
+ contacts admin
+ initial_state o
+ check_interval 5.000000
+ retry_interval 1.000000
+ max_check_attempts 3
+ active_checks_enabled 1
+ passive_checks_enabled 1
+ obsess_over_host 1
+ event_handler_enabled 1
+ low_flap_threshold 0.000000
+ high_flap_threshold 0.000000
+ flap_detection_enabled 1
+ flap_detection_options o,d,u
+ freshness_threshold 0
+ check_freshness 0
+ notification_options d,u,r,f,s
+ notifications_enabled 1
+ notification_interval 120.000000
+ first_notification_delay 0.000000
+ stalking_options n
+ process_perf_data 1
+ failure_prediction_enabled 1
+ notes production database server
+ retain_status_information 1
+ retain_nonstatus_information 1
+ }
+
+define service {
+ host_name localhost
+ service_description http
+ check_period 24x7
+ check_command check_http
+ contacts admin
+ notification_period 24x7
+ initial_state o
+ check_interval 3.000000
+ retry_interval 1.000000
+ max_check_attempts 3
+ is_volatile 0
+ parallelize_check 1
+ active_checks_enabled 1
+ passive_checks_enabled 1
+ obsess_over_service 1
+ event_handler_enabled 0
+ low_flap_threshold 0.000000
+ high_flap_threshold 0.000000
+ flap_detection_enabled 1
+ flap_detection_options o,w,u,c
+ freshness_threshold 0
+ check_freshness 0
+ notification_options u,w,c,r
+ notifications_enabled 1
+ notification_interval 60.000000
+ first_notification_delay 0.000000
+ stalking_options n
+ process_perf_data 1
+ failure_prediction_enabled 1
+ retain_status_information 1
+ retain_nonstatus_information 1
+ }
+
185 test/dat/status.dat
@@ -0,0 +1,185 @@
+########################################
+# NAGIOS STATUS FILE
+#
+# THIS FILE IS AUTOMATICALLY GENERATED
+# BY NAGIOS. DO NOT MODIFY THIS FILE!
+########################################
+
+info {
+ created=1244249352
+ version=3.0.6
+ }
+
+programstatus {
+ modified_host_attributes=0
+ modified_service_attributes=0
+ nagios_pid=20338
+ daemon_mode=1
+ program_start=1244249322
+ last_command_check=1244249351
+ last_log_rotation=0
+ enable_notifications=1
+ active_service_checks_enabled=1
+ passive_service_checks_enabled=1
+ active_host_checks_enabled=1
+ passive_host_checks_enabled=1
+ enable_event_handlers=1
+ obsess_over_services=0
+ obsess_over_hosts=0
+ check_service_freshness=1
+ check_host_freshness=0
+ enable_flap_detection=1
+ enable_failure_prediction=1
+ process_performance_data=0
+ global_host_event_handler=
+ global_service_event_handler=
+ next_comment_id=4
+ next_downtime_id=1
+ next_event_id=133
+ next_problem_id=61
+ next_notification_id=1821
+ total_external_command_buffer_slots=4096
+ used_external_command_buffer_slots=0
+ high_external_command_buffer_slots=0
+ active_scheduled_host_check_stats=1,1,1
+ active_ondemand_host_check_stats=0,0,0
+ passive_host_check_stats=0,0,0
+ active_scheduled_service_check_stats=0,0,0
+ active_ondemand_service_check_stats=0,0,0
+ passive_service_check_stats=0,0,0
+ cached_host_check_stats=0,0,0
+ cached_service_check_stats=0,0,0
+ external_command_stats=0,0,0
+ parallel_host_check_stats=1,1,1
+ serial_host_check_stats=0,0,0
+ }
+
+hoststatus {
+ host_name=localhost
+ modified_attributes=0
+ check_command=check-host-alive
+ check_period=24x7
+ notification_period=
+ check_interval=5.000000
+ retry_interval=1.000000
+ event_handler=
+ has_been_checked=1
+ should_be_scheduled=1
+ check_execution_time=4.056
+ check_latency=0.735
+ check_type=0
+ current_state=0
+ last_hard_state=0
+ last_event_id=0
+ current_event_id=0
+ current_problem_id=0
+ last_problem_id=0
+ plugin_output=PING OK - Packet loss = 0%, RTA = 0.09 ms
+ long_plugin_output=
+ performance_data=rta=0.091000ms;3000.000000;5000.000000;0.000000 pl=0%;80;100;0
+ last_check=1244249322
+ next_check=1244249632
+ check_options=0
+ current_attempt=1
+ max_attempts=3
+ current_event_id=0
+ last_event_id=0
+ state_type=1
+ last_state_change=0
+ last_hard_state_change=0
+ last_time_up=1244249332
+ last_time_down=0
+ last_time_unreachable=0
+ last_notification=0
+ next_notification=0
+ no_more_notifications=0
+ current_notification_number=0
+ current_notification_id=0
+ notifications_enabled=1
+ problem_has_been_acknowledged=0
+ acknowledgement_type=0
+ active_checks_enabled=1
+ passive_checks_enabled=1
+ event_handler_enabled=1
+ flap_detection_enabled=1
+ failure_prediction_enabled=1
+ process_performance_data=1
+ obsess_over_host=1
+ last_update=1244249352
+ is_flapping=0
+ percent_state_change=0.00
+ scheduled_downtime_depth=0
+ }
+
+servicestatus {
+ host_name=localhost
+ service_description=http
+ modified_attributes=0
+ check_command=check_http
+ check_period=24x7
+ notification_period=24x7
+ check_interval=3.000000
+ retry_interval=1.000000
+ event_handler=
+ has_been_checked=0
+ should_be_scheduled=1
+ check_execution_time=0.000
+ check_latency=0.000
+ check_type=0
+ current_state=0
+ last_hard_state=0
+ last_event_id=0
+ current_event_id=0
+ current_problem_id=0
+ last_problem_id=0
+ current_attempt=1
+ max_attempts=3
+ current_event_id=0
+ last_event_id=0
+ state_type=1
+ last_state_change=0
+ last_hard_state_change=0
+ last_time_ok=0
+ last_time_warning=0
+ last_time_unknown=0
+ last_time_critical=0
+ plugin_output=
+ long_plugin_output=
+ performance_data=
+ last_check=0
+ next_check=1244249502
+ check_options=0
+ current_notification_number=0
+ current_notification_id=0
+ last_notification=0
+ next_notification=0
+ no_more_notifications=0
+ notifications_enabled=1
+ active_checks_enabled=1
+ passive_checks_enabled=1
+ event_handler_enabled=0
+ problem_has_been_acknowledged=0
+ acknowledgement_type=0
+ flap_detection_enabled=1
+ failure_prediction_enabled=1
+ process_performance_data=1
+ obsess_over_service=1
+ last_update=1244249352
+ is_flapping=0
+ percent_state_change=0.00
+ scheduled_downtime_depth=0
+ }
+
+contactstatus {
+ contact_name=admin
+ modified_attributes=0
+ modified_host_attributes=0
+ modified_service_attributes=0
+ host_notification_period=24x7
+ service_notification_period=24x7
+ last_host_notification=0
+ last_service_notification=0
+ host_notifications_enabled=1
+ service_notifications_enabled=1
+ }
+
1 twilionagios/__init__.py
@@ -0,0 +1 @@
+from twilio_nagios import TwilioNagios
113 twilionagios/twilio_nagios.py
@@ -0,0 +1,113 @@
+#!/usr/bin/python
+from twisted.web import server, resource
+from twisted.web.resource import Resource
+from twisted.application import internet, service
+
+import re
+
def parse_objects(file):
    """Parse a Nagios object cache file into a lookup table.

    The file is expected to consist of ``define <type> { ... }`` sections
    whose body lines are ``<attribute> <value>`` pairs separated by
    whitespace.  Blank lines and lines starting with ``#`` are ignored.

    Args:
        file: Path of the object cache file to read.

    Returns:
        A dict mapping ``('host', host_name)`` to the attribute dict of each
        ``host`` definition and ``(service_description, host_name)`` to the
        attribute dict of each ``service`` definition.  Other definition
        types are parsed but not returned.

    Raises:
        IOError/OSError: if the file cannot be opened.
        KeyError: if a host or service definition lacks ``host_name`` (or a
            service lacks ``service_description``).
    """
    # Patterns are applied to stripped lines and anchored so that text inside
    # an attribute value can never be mistaken for a section header.
    define_re = re.compile(r"^define\s+(\w+)\s*\{$")
    attr_re = re.compile(r"^(\w+)(?:\s+(.*))?$")

    definitions = []
    cur = None  # [type, attributes] of the section being read, if any
    with open(file, 'r') as f:
        for raw in f:
            line = raw.strip()
            if not line or line.startswith('#'):
                continue

            header = define_re.match(line)
            if header:
                cur = [header.group(1), {}]
                continue

            if cur is None:
                # Stray content outside of any define block.
                continue

            if line.startswith('}'):
                definitions.append(cur)
                cur = None
                continue

            attr = attr_re.match(line)
            if attr:
                cur[1][attr.group(1)] = attr.group(2) or ''

    new_conf = {}
    for kind, attrs in definitions:
        if kind == 'host':
            new_conf[('host', attrs['host_name'])] = attrs
        elif kind == 'service':
            key = (attrs['service_description'], attrs['host_name'])
            new_conf[key] = attrs
    return new_conf
+
def parse_status(file):
    """Parse a Nagios status file into a lookup table.

    The file is expected to consist of ``<section> { ... }`` blocks whose
    body lines are ``<attribute>=<value>`` pairs.  Blank lines and lines
    starting with ``#`` are ignored.  The whole text after the first ``=``
    is kept as the value, so host names containing dots or hyphens and
    service descriptions containing spaces are preserved intact.

    Args:
        file: Path of the status file to read.

    Returns:
        A dict mapping ``('host', host_name)`` to the attribute dict of each
        ``hoststatus`` section and ``(service_description, host_name)`` to
        the attribute dict of each ``servicestatus`` section.  Other section
        types are parsed but not returned.

    Raises:
        IOError/OSError: if the file cannot be opened.
        KeyError: if a hoststatus/servicestatus section lacks ``host_name``
            (or a servicestatus lacks ``service_description``).
    """
    # Patterns are applied to stripped lines and anchored so that a value
    # such as "plugin_output=foo {" cannot be mistaken for a section header.
    header_re = re.compile(r"^(\w+)\s*\{$")
    attr_re = re.compile(r"^(\w+)=(.*)$")

    sections = []
    cur = None  # [section name, attributes] of the section being read
    with open(file, 'r') as f:
        for raw in f:
            line = raw.strip()
            if not line or line.startswith('#'):
                continue

            header = header_re.match(line)
            if header:
                cur = [header.group(1), {}]
                continue

            if cur is None:
                # Stray content outside of any section.
                continue

            if line.startswith('}'):
                sections.append(cur)
                cur = None
                continue

            attr = attr_re.match(line)
            if attr:
                cur[1][attr.group(1)] = attr.group(2)

    new_conf = {}
    for kind, attrs in sections:
        if kind == 'hoststatus':
            new_conf[('host', attrs['host_name'])] = attrs
        elif kind == 'servicestatus':
            key = (attrs['service_description'], attrs['host_name'])
            new_conf[key] = attrs
    return new_conf
+
# Spoken words for the numeric ``current_state`` of a host status entry.
HOST_STATE_MSG = {
    0: 'up',
    1: 'down',
    2: 'unreachable',
}
# Spoken words for the numeric ``current_state`` of a service status entry.
SERVICE_STATE_MSG = {
    0: 'ok',
    1: 'warning',
    2: 'critical',
    3: 'unknown',
}


class TwilioNagios(Resource):
    """Web resource that reports nagios host/service state as Twilio XML.

    Requests are expected at ``/<type>/<service>/<host>`` where ``<type>``
    is ``host`` or ``service``.  For host checks ``<service>`` is the literal
    ``host``.  The status and object files are re-read on every request so
    the answer reflects what is currently on disk.
    """

    isLeaf = True

    def __init__(self, objects, status):
        """Remember the file locations to read on each request.

        Args:
            objects: Path of the nagios object cache file.
            status: Path of the nagios status file.
        """
        Resource.__init__(self)
        self.objects = objects
        self.status = status

    def render(self, request):
        """Return the Twilio XML response body for *request*.

        An empty ``<Response/>`` is returned when the path does not have
        exactly three segments, names an unknown type, host or service, or
        when the status entry has no usable ``current_state``.
        """
        # Function-scope import keeps this block self-contained.
        from xml.sax.saxutils import escape

        request.setHeader('Content-Type', 'text/xml')
        status = parse_status(self.status)
        conf = parse_objects(self.objects)
        try:
            kind, service, name = request.postpath
            status_data = status[(service, name)]
            host_data = conf[('host', name)]
            state = int(status_data['current_state'])
        except (KeyError, ValueError):
            return '<Response/>'

        # Fall back to the host name if the definition carries no alias.
        alias = host_data.get('alias', name)

        if kind == 'service':
            say = 'service %s on host %s is %s' % (
                service,
                alias,
                SERVICE_STATE_MSG.get(state, 'unknown'),
            )
        elif kind == 'host':
            # Prefer the free-form notes over the alias when present.
            msg = host_data.get('notes', alias)
            state_msg = HOST_STATE_MSG.get(state, 'unknown')
            if state == 1:
                # A host that is down is announced twice for emphasis.
                say = 'host %s is %s, I repeat, the host %s is %s' % (
                    msg, state_msg, msg, state_msg)
            else:
                say = 'the host %s is %s' % (msg, state_msg)
        else:
            # Unknown notification type: nothing sensible to say.
            return '<Response/>'

        # Escape &, < and > so config text cannot break the XML document.
        return """
<Response>
 <Say>%s</Say>
</Response> """ % escape(say)
27 twisted/plugins/twilio_nagios.py
@@ -0,0 +1,27 @@
+from zope.interface import implements
+
+from twisted.plugin import IPlugin
+from twisted.application.service import IServiceMaker
+from twisted.application import internet
+from twisted.web import server
+from twisted.python import usage
+
+from twilionagios import TwilioNagios
+
class Options(usage.Options):
    """Command-line parameters accepted by the twilio_nagios plugin."""

    # Each entry is [long name, short name, default, description].
    optParameters = [
        ['port', 'p', 8080,
         'port to run the twilio responder on'],
        ['objects', 'o', '/var/cache/nagios3/objects.cache',
         'location of nagios object cache'],
        ['status', 's', '/var/cache/nagios3/status.dat',
         'location of the nagios status data file'],
    ]
+
class ServiceMaker(object):
    """twistd plugin that builds the twilio_nagios responder service."""

    implements(IServiceMaker, IPlugin)

    tapname = 'twilio_nagios'
    description = 'a nagios status parser that returns the data in twilio xml format'
    options = Options

    def makeService(self, options):
        """Return a TCP server serving TwilioNagios on the configured port."""
        responder = TwilioNagios(options['objects'], options['status'])
        site = server.Site(responder)
        # The port arrives as a string when given on the command line.
        listen_port = int(options['port'])
        return internet.TCPServer(listen_port, site)


# Module-level instance of the service maker defined above.
serviceMaker = ServiceMaker()

0 comments on commit 404ecf3

Please sign in to comment.