Skip to content
This repository has been archived by the owner on Mar 14, 2019. It is now read-only.

Redesign monitoring internals #30

Merged
merged 4 commits into from
Jan 31, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
30 changes: 30 additions & 0 deletions lib/monitoring/check_result.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
module Monitoring
class CheckResult
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This now captures the contract between checker and notifier. Before it was basically whatever and so it resulted in tight coupling between some checkers and notifiers.

attr_reader :resque_name, # Name of the resque the check checked
:scope, # Refined scope inside this resque, if relavent (e.g. queue name or class name)
:check_name, # Name of the thing checked
:check_count # Count related to what was being checked

def initialize(resque_name: nil,check_name: nil,check_count: nil, scope: nil)
@resque_name = required! resque_name, "resque_name"
@check_name = required! check_name, "check_name"
@check_count = required! check_count, "check_count", :to_i
@scope = optional scope
end

private

def required!(value, name, conversion = :to_s)
raise ArgumentError, "#{name} is required" if value.nil?
value.send(conversion)
end

def optional(value)
if value.nil?
nil
else
value.to_s
end
end
end
end
1 change: 1 addition & 0 deletions lib/monitoring/checker.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ class Checker
def initialize(resques: RESQUES)
@resques = resques
end
# Should return an array of CheckResult representing the results of the check
def check!
raise
end
Expand Down
8 changes: 5 additions & 3 deletions lib/monitoring/failed_job_check.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@
module Monitoring
class FailedJobCheck < Monitoring::Checker
def check!
Hash[@resques.all.map { |resque_instance|
[resque_instance.name,resque_instance.jobs_failed]
}]
@resques.all.map { |resque_instance|
CheckResult.new(resque_name: resque_instance.name,
check_name: "resque.failed_jobs",
check_count: resque_instance.jobs_failed.size)
}
end
end
end
10 changes: 5 additions & 5 deletions lib/monitoring/librato_notifier.rb
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
module Monitoring
class LibratoNotifier < Notifier
def initialize(prefix: nil, logger: Rails.logger, type: :count, unit: "")
@prefix = validate_prefix!(prefix)
def initialize(logger: Rails.logger, type: :count, unit: "")
@logger = logger
@type = type
@unit = unit || ""
Expand All @@ -11,9 +10,10 @@ def initialize(prefix: nil, logger: Rails.logger, type: :count, unit: "")
#
# results:: a hash where the keys represent the source (the resque instance name) and the values
# are lists of items to be counted. The items won't be examined, just counted and used in the metric
def notify!(results)
results.each do |resque_name,items|
log_to_librato(resque_name,@type,@prefix,items.size)
def notify!(check_results)
check_results.each do |check_result|
source = [check_result.resque_name,check_result.scope].compact.join(".")
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is how the scope is used. Bascially if we think of the resque name as the source, the scope is a refinement inside that resque of what the source is. In the case of the queue size check, it uses the name of the queue as the scope (so, in theory, we could alert on different queue sizes differently, which is what i intend to do with the failed job classes)

log_to_librato(source,@type,check_result.check_name,check_result.check_count)
end
end

Expand Down
3 changes: 2 additions & 1 deletion lib/monitoring/monitor.rb
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
module Monitoring
class Monitor
def initialize(checker: checker,notifier: notifier)
def initialize(checker: nil,notifier: nil)
raise ArgumentError, "both checker and notifier are required" if checker.nil? || notifier.nil?
@checker = checker
@notifier = notifier
end
Expand Down
4 changes: 3 additions & 1 deletion lib/monitoring/notifier.rb
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
module Monitoring
class Notifier
def notify!(results)

# This will be given an array of CheckResult instances
def notify!(check_results)
raise
end
end
Expand Down
16 changes: 0 additions & 16 deletions lib/monitoring/per_queue_librato_notifier.rb

This file was deleted.

12 changes: 9 additions & 3 deletions lib/monitoring/queue_size_check.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,15 @@
module Monitoring
class QueueSizeCheck < Monitoring::Checker
def check!
Hash[@resques.all.map { |resque_instance|
[resque_instance.name,resque_instance.jobs_waiting]
}]
@resques.all.map { |resque_instance|
resque_instance.jobs_waiting.keys.sort.map { |queue_name|
jobs = resque_instance.jobs_waiting[queue_name]
CheckResult.new(resque_name: resque_instance.name,
scope: queue_name,
check_name: "resque.queue_size",
check_count: jobs.size)
}
}.flatten
end
end
end
8 changes: 5 additions & 3 deletions lib/monitoring/stale_worker_check.rb
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
module Monitoring
class StaleWorkerCheck < Monitoring::Checker
def check!
Hash[@resques.all.map { |resque_instance|
[resque_instance.name,resque_instance.jobs_running.select(&:too_long?)]
}]
@resques.all.map { |resque_instance|
CheckResult.new(resque_name: resque_instance.name,
check_name: "resque.stale_workers",
check_count: resque_instance.jobs_running.select(&:too_long?).size)
}
end
end
end
6 changes: 3 additions & 3 deletions lib/tasks/monitor.rake
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,15 @@ namespace :monitor do
task :failed => :environment do
monitor = Monitoring::Monitor.new(
checker: Monitoring::FailedJobCheck.new,
notifier: Monitoring::LibratoNotifier.new(prefix: "resque.failed_jobs", unit: "jobs"))
notifier: Monitoring::LibratoNotifier.new(unit: "jobs"))
monitor.monitor!
end

desc "Check the number of stale workers and stat the results to the log in a way Librato can understand"
task :stale_workers => :environment do
monitor = Monitoring::Monitor.new(
checker: Monitoring::StaleWorkerCheck.new,
notifier: Monitoring::LibratoNotifier.new(prefix: "resque.stale_workers", type: :measure, unit: "workers"))
notifier: Monitoring::LibratoNotifier.new(type: :measure, unit: "workers"))
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

prefix was no longer needed as the CheckResult can control that


monitor.monitor!
end
Expand All @@ -20,7 +20,7 @@ namespace :monitor do
task :queue_sizes => :environment do
monitor = Monitoring::Monitor.new(
checker: Monitoring::QueueSizeCheck.new,
notifier: Monitoring::PerQueueLibratoNotifier.new(prefix: "resque.queue_size", type: :count, unit: "jobs"))
notifier: Monitoring::LibratoNotifier.new(unit: "jobs"))

monitor.monitor!
end
Expand Down
8 changes: 4 additions & 4 deletions test/integration/monitoring_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,9 @@ class MonitoringTest < ActionDispatch::IntegrationTest

Rake::Task['monitor:queue_sizes'].invoke

assert_equal "source=test1.cache count#resque.queue_size=2jobs" , logger.infos[0]
assert_equal "source=test1.mail count#resque.queue_size=4jobs" , logger.infos[1]
assert_equal "source=test2.admin count#resque.queue_size=2jobs" , logger.infos[2]
assert_equal "source=test2.mail count#resque.queue_size=1jobs" , logger.infos[3]
assert_equal "source=test1.cache count#resque.queue_size=2jobs" , logger.infos[0], "0" + logger.infos.inspect
assert_equal "source=test1.mail count#resque.queue_size=4jobs" , logger.infos[1], "1" + logger.infos.inspect
assert_equal "source=test2.admin count#resque.queue_size=2jobs" , logger.infos[2], "2" + logger.infos.inspect
assert_equal "source=test2.mail count#resque.queue_size=1jobs" , logger.infos[3], "3" + logger.infos.inspect
end
end
21 changes: 12 additions & 9 deletions test/lib/monitoring/failed_job_check_test.rb
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
require 'quick_test_helper'
require 'support/resque_helpers'
require 'support/monitoring_helpers'
require 'minitest/autorun'
require 'resque'

lib_require 'monitoring/checker'
lib_require 'monitoring/check_result'
lib_require 'monitoring/failed_job_check'
rails_require 'models/resque_instance'
rails_require 'models/job'
Expand All @@ -14,12 +16,13 @@ module Monitoring
end
class Monitoring::FailedJobCheckTest < MiniTest::Test
include ResqueHelpers
include MonitoringHelpers

def setup_resques(test1: 1, test2: 2)
def setup_resques(test1: ["BazJob"], test2: ["FooJob","BarJob"])
Redis.new.flushall
Resques.new([
add_failed_jobs(num_failed: test1, resque_instance: resque_instance("test1",:resque)),
add_failed_jobs(num_failed: test2, resque_instance: resque_instance("test2",:resque2)),
add_failed_jobs(job_class_names: test1, resque_instance: resque_instance("test1",:resque)),
add_failed_jobs(job_class_names: test2, resque_instance: resque_instance("test2",:resque2)),
])
end

Expand All @@ -28,22 +31,22 @@ def test_type
end

def test_failed_jobs
resques = setup_resques(test1: 1, test2: 2)
resques = setup_resques
check = Monitoring::FailedJobCheck.new(resques: resques)

results = check.check!

assert_equal 1,results["test1"].size,results["test1"].inspect
assert_equal 2,results["test2"].size,results["test2"].inspect
assert_check_result results[0], resque_name: "test1", check_count: 1
assert_check_result results[1], resque_name: "test2", check_count: 2
end

def test_no_failed_jobs
resques = setup_resques(test1: 0, test2: 0)
resques = setup_resques(test1: [], test2: [])
check = Monitoring::FailedJobCheck.new(resques: resques)

results = check.check!

assert_equal 0,results["test1"].size,results.inspect
assert_equal 0,results["test2"].size,results.inspect
assert_check_result results[0], resque_name: "test1", check_count: 0
assert_check_result results[1], resque_name: "test2", check_count: 0
end
end
51 changes: 27 additions & 24 deletions test/lib/monitoring/librato_notifier_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
require 'support/fake_logger'

lib_require 'monitoring/notifier'
lib_require 'monitoring/check_result'
lib_require 'monitoring/librato_notifier'

module Monitoring
Expand All @@ -14,26 +15,14 @@ def test_type
assert Monitoring::LibratoNotifier.ancestors.include?(Monitoring::Notifier)
end

def test_requires_a_prefix
assert_raises ArgumentError do
Monitoring::LibratoNotifier.new(logger: FakeLogger.new)
end
end

def test_prefix_should_just_have_alpha_nums_and_dots
assert_raises ArgumentError do
Monitoring::LibratoNotifier.new(prefix: "foo bar", logger: FakeLogger.new)
end
end

def test_logs_results
logger = FakeLogger.new
notifier = Monitoring::LibratoNotifier.new(prefix: "foo.bar", logger: logger, unit: "jobs")
notifier.notify!({
"test1" => [ Object.new, Object.new, Object.new ],
"test2" => [ Object.new ],
"test3" => [],
})
notifier = Monitoring::LibratoNotifier.new(logger: logger, unit: "jobs")
notifier.notify!([
Monitoring::CheckResult.new(resque_name: "test1", check_name: "foo.bar", check_count: 3),
Monitoring::CheckResult.new(resque_name: "test2", check_name: "foo.bar", check_count: 1),
Monitoring::CheckResult.new(resque_name: "test3", check_name: "foo.bar", check_count: 0),
])

assert_equal "source=test1 count#foo.bar=3jobs", logger.infos[0]
assert_equal "source=test2 count#foo.bar=1jobs", logger.infos[1]
Expand All @@ -42,15 +31,29 @@ def test_logs_results

def test_logs_results_as_measure
logger = FakeLogger.new
notifier = Monitoring::LibratoNotifier.new(prefix: "foo.bar", logger: logger, type: :measure, unit: "workers")
notifier.notify!({
"test1" => [ Object.new, Object.new, Object.new ],
"test2" => [ Object.new ],
"test3" => [],
})
notifier = Monitoring::LibratoNotifier.new(logger: logger, type: :measure, unit: "workers")
notifier.notify!([
Monitoring::CheckResult.new(resque_name: "test1", check_name: "foo.bar", check_count: 3),
Monitoring::CheckResult.new(resque_name: "test2", check_name: "foo.bar", check_count: 1),
Monitoring::CheckResult.new(resque_name: "test3", check_name: "foo.bar", check_count: 0),
])

assert_equal "source=test1 measure#foo.bar=3workers", logger.infos[0]
assert_equal "source=test2 measure#foo.bar=1workers", logger.infos[1]
assert_equal "source=test3 measure#foo.bar=0workers", logger.infos[2]
end

def test_logs_results_with_scope
logger = FakeLogger.new
notifier = Monitoring::LibratoNotifier.new(logger: logger, unit: "workers")
notifier.notify!([
Monitoring::CheckResult.new(resque_name: "test1", scope: "baz", check_name: "foo.bar", check_count: 3),
Monitoring::CheckResult.new(resque_name: "test2", scope: "baz", check_name: "foo.bar", check_count: 1),
Monitoring::CheckResult.new(resque_name: "test3", scope: "baz", check_name: "foo.bar", check_count: 0),
])

assert_equal "source=test1.baz count#foo.bar=3workers", logger.infos[0]
assert_equal "source=test2.baz count#foo.bar=1workers", logger.infos[1]
assert_equal "source=test3.baz count#foo.bar=0workers", logger.infos[2]
end
end
54 changes: 0 additions & 54 deletions test/lib/monitoring/per_queue_librato_notifier_test.rb

This file was deleted.