Skip to content

Commit

Permalink
HostManager: change how stability factor is computed
Browse files Browse the repository at this point in the history
The stability factor is now computed from *unexpected* failures rather
than from all start times.
  • Loading branch information
mhasself committed Feb 2, 2022
1 parent c1188e6 commit dcba005
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 13 deletions.
6 changes: 3 additions & 3 deletions agents/host_manager/host_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ def _update_target_states(self, session, requests=[],
'prot': prot,
'full_name': ('%s:%s' % tuple(k)),
'agent_script': agent_script,
'start_times': [],
'fail_times': [],
}

# Special requests will target specific instance_id; make a map for that.
Expand Down Expand Up @@ -343,8 +343,8 @@ def manager(self, session, params):
any_jobs = (any_jobs or (db['next_action'] != 'down'))

# Criteria for stability:
db['start_times'], db['stability'] = hm_utils.stability_factor(
db['start_times'])
db['fail_times'], db['stability'] = hm_utils.stability_factor(
db['fail_times'])

# Clean up retired items.
self.database = {k:v for k,v in self.database.items()
Expand Down
23 changes: 13 additions & 10 deletions ocs/agent/host_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,6 @@ def resolve_child_state(db):
db['next_action'] = 'wait_start'
now = time.time()
db['at'] = now + 1.
db['start_times'].append(now)
elif db['next_action'] == 'up':
stat, t = prot.status
if stat is not None:
Expand All @@ -103,6 +102,7 @@ def resolve_child_state(db):
.format('\n'.join(lines), note=note, **db))
db['next_action'] = 'start_at'
db['at'] = time.time() + 3
db['fail_times'].append(time.time())
else: # 'down'
db['next_action'] = 'start'

Expand Down Expand Up @@ -148,22 +148,25 @@ def resolve_child_state(db):


def stability_factor(times, window=120):
    """Quantify the stability of an activity from its failure times.

    Each retained failure contributes a penalty of ``5 / age``, where
    ``age`` is the number of seconds since that failure; the stability
    factor is 1 minus the summed penalties, clamped to the range 0 - 1.
    A single failure, 10 seconds in the past, therefore has a
    stability factor of 0.5; if there were additional failures before
    that, the stability factor will be lower.

    Args:
        times: increasing list of failure timestamps (as returned by
            ``time.time()``).
        window: age, in seconds, beyond which failures are discarded.
            The most recent failure is always kept, regardless of age.

    Returns:
        A tuple ``(times, stability)``: the culled list of failure
        times and the stability factor.  With no failures recorded the
        input list is returned unchanged, with a stability factor of 1.

    """
    now = time.time()
    if not times:
        return times, 1.
    # Only keep the last few failures, within our time window; the
    # most recent failure is retained unconditionally.
    times = [t for t in times[-200:-1]
             if t >= now - window] + times[-1:]
    penalty = 0.
    for t in times:
        age = now - t
        if age <= 0:
            # A failure stamped at (or after) the present instant would
            # divide by zero or produce a negative penalty; treat it as
            # maximally unstable instead.
            return times, 0.
        penalty += 5. / age
    return times, max(1. - penalty, 0.)


class AgentProcessHelper(protocol.ProcessProtocol):
Expand Down

0 comments on commit dcba005

Please sign in to comment.