From 315eb948d5b7ef985d82738836c361e53cf65f4f Mon Sep 17 00:00:00 2001 From: Ryan LeCompte Date: Sun, 7 Oct 2012 16:12:41 -0700 Subject: [PATCH] add support for specifying required node managers to make decisions --- README.md | 28 ++++++---- examples/config.yml | 3 ++ lib/redis_failover/cli.rb | 5 ++ .../failover_strategy/latency.rb | 7 ++- lib/redis_failover/node_manager.rb | 52 +++++++++++++------ redis_failover.gemspec | 2 +- 6 files changed, 66 insertions(+), 31 deletions(-) diff --git a/README.md b/README.md index 3733287..a45cef3 100644 --- a/README.md +++ b/README.md @@ -68,17 +68,18 @@ following options: Specific options: - -n, --nodes NODES Comma-separated redis host:port pairs - -z, --zkservers SERVERS Comma-separated ZooKeeper host:port pairs - -p, --password PASSWORD Redis password - --znode-path PATH Znode path override for storing redis server list - --max-failures COUNT Max failures before manager marks node unavailable - -C, --config PATH Path to YAML config file - --with-chroot ROOT Path to ZooKeepers chroot - -E, --environment ENV Config environment to use - --node-strategy STRATEGY Strategy used when determining availability of nodes (default: majority) - --failover-strategy STRATEGY Strategy used when failing over to a new node (default: latency) - -h, --help Display all options + -n, --nodes NODES Comma-separated redis host:port pairs + -z, --zkservers SERVERS Comma-separated ZooKeeper host:port pairs + -p, --password PASSWORD Redis password + --znode-path PATH Znode path override for storing redis server list + --max-failures COUNT Max failures before manager marks node unavailable + -C, --config PATH Path to YAML config file + --with-chroot ROOT Path to ZooKeepers chroot + -E, --environment ENV Config environment to use + --node-strategy STRATEGY Strategy used when determining availability of nodes (default: majority) + --failover-strategy STRATEGY Strategy used when failing over to a new node (default: latency) + --required-node-managers COUNT Required Node Managers that must be reachable to determine node state (default: 1) + -h, --help Display all options To start the daemon for a simple master/slave configuration, use the following: @@ -171,6 +172,11 @@ When a failover happens, the primary Node Manager will now consult a "failover s strategy is provided by redis_failover: latency. This strategy simply selects a node that is both marked as available by all Node Managers and has the lowest average latency for its last health check. +Note that you should set the "required_node_managers" configuration option appropriately. This value (defaults to 1) is used to determine if enough Node +Managers have reported their view of a node's state. For example, if you have deployed 5 Node Managers, then you should set this value to 5 if you only +want to accept a node's availability when all 5 Node Managers are part of the snapshot. To give yourself flexibility, you may want to set this value to 3 +instead. This would give you flexibility to take down 2 Node Managers, while still allowing the cluster to be managed appropriately. + ## Documentation redis_failover uses YARD for its API documentation. Refer to the generated [API documentation](http://rubydoc.info/github/ryanlecompte/redis_failover/master/frames) for full coverage. diff --git a/examples/config.yml b/examples/config.yml index 9598cc9..2877c35 100644 --- a/examples/config.yml +++ b/examples/config.yml @@ -2,6 +2,9 @@ # redis_node_manager -C config.yml --- :max_failures: 2 +:node_strategy: majority +:failover_strategy: latency +:required_node_managers: 2 :nodes: - localhost:6379 - localhost:1111 diff --git a/lib/redis_failover/cli.rb b/lib/redis_failover/cli.rb index b552cc9..6eab4dc 100644 --- a/lib/redis_failover/cli.rb +++ b/lib/redis_failover/cli.rb @@ -58,6 +58,11 @@ def self.parse(source) options[:failover_strategy] = strategy end + opts.on('--required-node-managers COUNT', + 'Required Node Managers that must be reachable to determine node state (default: 1)') do |count| + options[:required_node_managers] = Integer(count) + end + opts.on('-h', '--help', 'Display all options') do puts opts exit diff --git a/lib/redis_failover/failover_strategy/latency.rb b/lib/redis_failover/failover_strategy/latency.rb index 3b53cde..d16ea81 100644 --- a/lib/redis_failover/failover_strategy/latency.rb +++ b/lib/redis_failover/failover_strategy/latency.rb @@ -1,17 +1,16 @@ module RedisFailover class FailoverStrategy - # Failover strategy that selects an availaboe node that is both seen by all + # Failover strategy that selects an available node that is both seen by all # node managers and has the lowest reported health check latency. class Latency < FailoverStrategy # @see RedisFailover::FailoverStrategy#find_candidate def find_candidate(snapshots) logger.info('Attempting to find candidate from snapshots:') logger.info("\n" + snapshots.values.join("\n")) - all_node_managers = Set.new - all_node_managers.merge(snapshots.values.map(&:node_managers).flatten) + candidates = {} snapshots.each do |node, snapshot| - if snapshot.available_count == all_node_managers.size + if snapshot.all_available? candidates[node] = snapshot.avg_latency end end diff --git a/lib/redis_failover/node_manager.rb b/lib/redis_failover/node_manager.rb index dfee14b..7e06418 100644 --- a/lib/redis_failover/node_manager.rb +++ b/lib/redis_failover/node_manager.rb @@ -39,11 +39,13 @@ class NodeManager def initialize(options) logger.info("Redis Node Manager v#{VERSION} starting (#{RUBY_DESCRIPTION})") @options = options + @required_node_managers = options.fetch(:required_node_managers, 1) @root_znode = options.fetch(:znode_path, Util::DEFAULT_ROOT_ZNODE_PATH) @node_strategy = NodeStrategy.for(options.fetch(:node_strategy, :majority)) @failover_strategy = FailoverStrategy.for(options.fetch(:failover_strategy, :latency)) @nodes = Array(@options[:nodes]).map { |opts| Node.new(opts) }.uniq @master_manager = false + @sufficient_node_managers = false @lock = Mutex.new @shutdown = false end @@ -214,7 +216,7 @@ def promote_new_master(snapshots, node = nil) def discover_nodes @lock.synchronize do return unless running? - @unavailable = [] + @slaves, @unavailable = [], [] if @master = find_existing_master logger.info("Using master #{@master} from existing znode config.") elsif @master = guess_master(@nodes) @@ -545,6 +547,7 @@ def wait_until_master logger.info('Acquired master Node Manager lock.') logger.info("Configured node strategy #{@node_strategy.class}") logger.info("Configured failover strategy #{@failover_strategy.class}") + logger.info("Required Node Managers to make a decision: #{@required_node_managers}") manage_nodes end end @@ -561,17 +564,19 @@ def manage_nodes # Periodically update master config state. while running? && master_manager? @zk_lock.assert! + sleep(CHECK_INTERVAL) + @lock.synchronize do snapshots = current_node_snapshots - snapshots.each_key do |node| - update_master_state(node, snapshots) - end + if ensure_sufficient_node_managers(snapshots) + snapshots.each_key do |node| + update_master_state(node, snapshots) + end - # flush current master state - write_current_redis_nodes + # flush current master state + write_current_redis_nodes + end end - - sleep(CHECK_INTERVAL) end end @@ -590,13 +595,6 @@ def with_lock @zk_lock ||= @zk.locker('master_redis_node_manager_lock') begin - # we manually attempt to delete the lock path before - # acquiring the lock, since currently the lock doesn't - # get cleaned up if there is a connection error while - # the client was previously blocked in the #lock! call. - if path = @zk_lock.lock_path - @zk.delete(path, :ignore => :no_node) - end @zk_lock.lock!(true) rescue Exception # handle shutdown case @@ -656,5 +654,29 @@ def running? def stringify_nodes(nodes) "(#{nodes.map(&:to_s).join(', ')})" end + + # Determines if each snapshot has a sufficient number of node managers. + # + # @param [Hash] snapshots the current snapshots + # @return [Boolean] true if sufficient, false otherwise + def ensure_sufficient_node_managers(snapshots) + currently_sufficient = true + snapshots.each do |node, snapshot| + node_managers = snapshot.node_managers + if node_managers.size < @required_node_managers + logger.error("Not enough Node Managers in snapshot for node #{node}. " + + "Required: #{@required_node_managers}, " + + "Available: #{node_managers.size} #{node_managers}") + currently_sufficient = false + end + end + + if currently_sufficient && !@sufficient_node_managers + logger.info("Can see all required Node Managers: #{@required_node_managers}") + end + + @sufficient_node_managers = currently_sufficient + @sufficient_node_managers + end end end diff --git a/redis_failover.gemspec b/redis_failover.gemspec index 9a9a259..d90e3b0 100644 --- a/redis_failover.gemspec +++ b/redis_failover.gemspec @@ -18,7 +18,7 @@ Gem::Specification.new do |gem| gem.add_dependency('redis', ['>= 2.2', '< 4']) gem.add_dependency('redis-namespace') gem.add_dependency('multi_json', '~> 1') - gem.add_dependency('zk', '~> 1.7') + gem.add_dependency('zk', ['>= 1.7.1', '< 1.8']) gem.add_development_dependency('rake') gem.add_development_dependency('rspec')