Permalink
Browse files

Initial commit.

  • Loading branch information...
0 parents commit 4970399184a9dbec5f4aa247ccfde43b2b9e0dbc @aphyr aphyr committed Feb 19, 2012
Showing with 323 additions and 0 deletions.
  1. +6 −0 .gitignore
  2. +21 −0 LICENSE
  3. +18 −0 README.markdown
  4. +42 −0 Rakefile.rb
  5. +67 −0 bin/reimann-bench
  6. +165 −0 bin/reimann-health
  7. +4 −0 lib/reimann/tools.rb
@@ -0,0 +1,6 @@
+pkg/
+._*
+*~
+.DS_Store
+.*.swp
+*.log
21 LICENSE
@@ -0,0 +1,21 @@
+The MIT License
+
+Copyright (c) 2011 Kyle Kingsbury
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
@@ -0,0 +1,18 @@
+Reimann Tools
+=============
+
+Tiny programs to submit events to Reimann.
+
+reimann-health, for example, submits events about the current CPU, load,
+memory, and disk use. reimann-bench submits randomly distributed metrics for
+load testing. I've got a whole bunch of similar tools internally for monitoring
+Redis, Riak, queues, etc. Most have internal configuration dependencies, so
+it'll be a while before I can extract them for re-use.
+
+Get started
+==========
+
+``` bash
+gem install reimann-tools
+reimann-health --host my.reimann.server
+```
@@ -0,0 +1,42 @@
+require 'rubygems'
+require 'rubygems/package_task'
+require 'rdoc/task'
+require 'find'
+
+# Don't include resource forks in tarballs on Mac OS X.
+ENV['COPY_EXTENDED_ATTRIBUTES_DISABLE'] = 'true'
+ENV['COPYFILE_DISABLE'] = 'true'
+
+# Gemspec
+gemspec = Gem::Specification.new do |s|
+ s.rubyforge_project = 'reimann-tools'
+
+ s.name = 'reimann-tools'
+ s.version = '0.0.1'
+ s.author = 'Kyle Kingsbury'
+ s.email = 'aphyr@aphyr.com'
+ s.homepage = 'https://github.com/aphyr/reimann-tools'
+ s.platform = Gem::Platform::RUBY
+ s.summary = 'HTTP dashboard for the distributed event system Reimann.'
+
+ s.add_dependency 'reimann-client', '>= 0.0.4'
+ s.add_dependency 'trollop', '>= 1.16.2'
+
+ s.files = FileList['lib/**/*', 'bin/*', 'LICENSE', 'README.markdown'].to_a
+ s.executables |= Dir.entries('bin/')
+ s.require_path = 'lib'
+ s.has_rdoc = true
+
+ s.required_ruby_version = '>= 1.9.1'
+end
+
+Gem::PackageTask.new gemspec do |p|
+end
+
+RDoc::Task.new do |rd|
+ rd.main = 'Reimann Tools'
+ rd.title = 'Reimann Tools'
+ rd.rdoc_dir = 'doc'
+
+ rd.rdoc_files.include('lib/**/*.rb')
+end
@@ -0,0 +1,67 @@
+#!/usr/bin/env ruby
+
+# Connects to a server (first arg) and populates it with a constant stream of
+# events for testing.
+
+require 'reimann/client'
+require 'pp'
+
+class Reimann::Bench
+ attr_accessor :client, :hosts, :services, :states
+ def initialize
+ @hosts = [nil] + (0...10).map { |i| "host#{i}" }
+ @hosts = ['test']
+ @services = %w(per)
+ @states = {}
+ @client = Reimann::Client.new(host: (ARGV.first || 'localhost'))
+ end
+
+ def evolve(state)
+ m = rand
+ s = case m
+ when 0...0.75
+ 'ok'
+ when 0.75...0.9
+ 'warning'
+ when 0.9..1.0
+ 'critical'
+ end
+
+ {
+ metric_f: m,
+ state: s,
+ host: state[:host],
+ service: state[:service],
+ description: "at #{Time.now}"
+ }
+ end
+
+ def tick
+# pp @states
+ hosts.product(services).each do |id|
+ client << (states[id] = evolve(states[id]))
+ end
+ end
+
+ def run
+ start
+ loop do
+# sleep 0.01
+ tick
+ end
+ end
+
+ def start
+ hosts.product(services).each do |host, service|
+ states[[host, service]] = {
+ metric_f: 0.5,
+ state: 'ok',
+ description: "Starting up",
+ host: host,
+ service: service
+ }
+ end
+ end
+end
+
+Reimann::Bench.new.run
@@ -0,0 +1,165 @@
+#!/usr/bin/env ruby
+
+# Reports current CPU, disk, load average, and memory use to reimann.
+
+require 'trollop'
+require 'reimann/client'
+
+class Reimann::Health
+ def initialize(opts)
+ @host = opts[:host]
+ @port = opts[:port]
+ @interval = opts[:interval]
+ @limits = {
+ cpu: {critical: opts[:cpu_critical], warning: opts[:cpu_warning]},
+ disk: {critical: opts[:disk_critical], warning: opts[:disk_warning]},
+ :load => {critical: opts[:load_critical], warning: opts[:load_warning]},
+ memory: {critical: opts[:memory_critical], warning: opts[:memory_warning]}
+ }
+
+ @client = Reimann::Client.new(:host => @host, :port => @port)
+ end
+
+ def alert(service, state, metric, description)
+ @client << {
+ service: service,
+ state: state.to_s,
+ metric: metric.to_f,
+ description: description
+ }
+ end
+
+ def cores
+ i = 0;
+ File.read("/proc/cpuinfo").split(/\n\n/).inject({}) do |cores, p|
+ physical_id = p[/physical id\s+:\s+(\d+)/, 1]
+ core_id = p[/core id\s+:\s+(\d+)/, 1]
+ if physical_id and core_id
+ cores["#{physical_id}:#{core_id}"] = true
+ elsif physical_id
+ cores["#{physical_id}:"] = true
+ else
+ cores[i += 1] = true;
+ end
+
+ cores
+ end.size
+ end
+
+ def cpu
+ new = File.read('/proc/stat')
+ unless new[/cpu\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)/]
+ alert 'cpu', :unknown, nil, "/proc/stat doesn't include a CPU line"
+ return false
+ end
+ u2, n2, s2, i2 = [$1, $2, $3, $4].map { |e| e.to_i }
+
+ if @old_cpu
+ u1, n1, s1, i1 = @old_cpu
+
+ used = (u2+n2+s2) - (u1+n1+s1)
+ total = used + i2-i1
+ fraction = used.to_f / total
+
+ if fraction > @limits[:cpu][:critical]
+ alert "cpu", :critical, fraction, "#{sprintf("%.2f", fraction * 100)}% user+nice+sytem\n\n#{cpu_report}"
+ elsif fraction > @limits[:cpu][:warning]
+ alert "cpu", :warning, fraction, "#{sprintf("%.2f", fraction * 100)}% user+nice+sytem\n\n#{cpu_report}"
+ else
+ alert "cpu", :ok, fraction, "#{sprintf("%.2f", fraction * 100)}% user+nice+sytem\n\n#{cpu_report}"
+ end
+ end
+
+ @old_cpu = [u2, n2, s2, i2]
+ end
+
+ def cpu_report
+ `ps -eo pcpu,pid,args | sort -nrb -k1 | head -10`.chomp
+ end
+
+ def disk
+ `df`.split(/\n/).each do |r|
+ f = r.split(/\s+/)
+ next unless f[0] =~ /^\//
+ next if f[0] == 'Filesystem'
+ x = f[4].to_f/100
+
+ if x > @limits[:disk][:critical]
+ alert "disk #{f[5]}", :critical, x, "#{f[4]} used"
+ elsif x > @limits[:disk][:warning]
+ alert "disk #{f[5]}", :warning, x, "#{f[4]} used"
+ else
+ alert "disk #{f[5]}", :ok, x, "#{f[4]} used"
+ end
+ end
+ end
+
+ def load
+ load = File.read('/proc/loadavg').split(/\s+/)[2].to_f / cores
+ if load > @limits[:load][:critical]
+ alert "load", :critical, load, "15-minute load average/core is #{load}"
+ elsif load > @limits[:load][:warning]
+ alert "load", :warning, load, "15-minute load average/core is #{load}"
+ else
+ alert "load", :ok, load, "15-minute load average/core is #{load}"
+ end
+ end
+
+ def memory
+ m = File.read('/proc/meminfo').split(/\n/).inject({}) { |info, line|
+ x = line.split(/:?\s+/)
+ # Assume kB...
+ info[x[0]] = x[1].to_i
+ info
+ }
+
+ free = m['MemFree'] + m['Buffers'] + m['Cached']
+ total = m['MemTotal']
+ fraction = 1 - (free.to_f / total)
+
+ if fraction > @limits[:memory][:critical]
+ alert "memory", :critical, fraction, "#{sprintf("%.2f", fraction * 100)}% used\n\n#{memory_report}"
+ elsif fraction > @limits[:memory][:warning]
+ alert "memory", :warning, fraction, "#{sprintf("%.2f", fraction * 100)}% used\n\n#{memory_report}"
+ else
+ alert "memory", :ok, fraction, "#{sprintf("%.2f", fraction * 100)}% used\n\n#{memory_report}"
+ end
+ end
+
+ def memory_report
+ `ps -eo pmem,pid,args | sort -nrb -k1 | head -10`.chomp
+ end
+
+ def tick
+ begin
+ cpu
+ memory
+ load
+ disk
+ rescue => e
+ $stderr.puts "#{e.class} #{e}\n#{e.backtrace.join "\n"}"
+ sleep 10
+ end
+ end
+
+ def run
+ loop do
+ tick
+ sleep @interval
+ end
+ end
+end
+
+Reimann::Health.new(Trollop.options do
+ opt :host, "Host", :default => '127.0.0.1'
+ opt :port, "Port", :default => 5555
+ opt :interval, "Seconds between updates", :default => 5
+ opt :cpu_warning, "CPU warning threshold (fraction of total jiffies)", :default => 0.9
+ opt :cpu_critical, "CPU critical threshold (fraction of total jiffies)", :default => 0.95
+ opt :disk_warning, "Disk warning threshold (fraction of space used)", :default => 0.9
+ opt :disk_critical, "Disk critical threshold (fraction of space used)", :default => 0.95
+ opt :load_warning, "Load warning threshold (load average / core)", :default => 3
+ opt :load_critical, "Load critical threshold (load average / core)", :default => 8
+ opt :memory_warning, "Memory warning threshold (fraction of RAM)", :default => 0.85
+ opt :memory_critical, "Memory critical threshold (fraction of RAM)", :default => 0.95
+end).run
@@ -0,0 +1,4 @@
+module Reimann
+ module Tools
+ end
+end

0 comments on commit 4970399

Please sign in to comment.