benchmark: add monitor command #593
Merged
@@ -0,0 +1,192 @@
# This file is part of REANA.
# Copyright (C) 2021 CERN.
#
# REANA is free software; you can redistribute it and/or modify it
# under the terms of the MIT License; see LICENSE file for more details.

"""Responsible for monitoring K8s cluster, DB connections."""

import json
import subprocess
import time
from abc import ABC, abstractmethod
from collections import defaultdict
from pathlib import Path
from typing import Any, Dict, Generator, List

from reana.reana_benchmark.utils import get_utc_now_timestamp, logger


class BaseMetric(ABC):
    """Base class for other metrics."""

    @property
    @abstractmethod
    def name(self) -> str:
        """Name of the metric."""
        raise NotImplementedError

    @abstractmethod
    def _collect(self, parameters: Dict) -> Any:
        raise NotImplementedError

    def collect(self, parameters: Dict) -> Dict[str, Any]:  # noqa: D102
        result = self._collect(parameters)
        return {
            self.name: result,
        }


class WorkflowDBStatusesMetric(BaseMetric):
    """Count number of workflow statuses directly from the DB."""

    name = "workflow_db_statuses"

    def _collect(self, parameters: Dict) -> Any:
        workflow_prefix = parameters.get("workflow")

        if not workflow_prefix:
            logger.warning(
                f"{self.name} metric cannot find the workflow parameter. "
                "Metric will not be collected."
            )
            return {}

        cmd = [
            "kubectl",
            "exec",
            "deployment/reana-db",
            "--",
            "psql",
            "-U",
            "reana",
            "-c",
            f"SELECT status,COUNT(*) FROM __reana.workflow WHERE name LIKE '{workflow_prefix}-%' GROUP BY status;",
        ]
        output = subprocess.check_output(cmd).decode("ascii")
        result = {}

        # Skip the two psql header lines and the trailing "(N rows)" footer.
        rows = output.splitlines()[2:-2]

        for row in rows:
            status, count = row.split("|")[0].strip(), int(row.split("|")[1].strip())
            result[status] = count

        return result


class NumberOfDBConnectionsMetric(BaseMetric):
    """Count number of server processes in REANA DB."""

    name = "db_connections_number"

    def _collect(self, parameters: Dict) -> Any:
        cmd = [
            "kubectl",
            "exec",
            "deployment/reana-db",
            "--",
            "psql",
            "-U",
            "reana",
            "-c",
            "SELECT COUNT(*) FROM pg_stat_activity;",
        ]
        output = subprocess.check_output(cmd).decode("ascii")
        # The third line of the psql output holds the single COUNT(*) value.
        result = int(output.splitlines()[2].strip())
        return result


class WorkflowPodsMetric(BaseMetric):
    """Count number of job and batch pods in different phases."""

    name = "workflows_pods_status"

    @staticmethod
    def _filter(pods: List[Dict], name_contains: str) -> Generator[Dict, None, None]:
        for pod in pods:
            name = pod.get("metadata", {}).get("name", "")
            if name_contains in name:
                yield pod

    @staticmethod
    def _count(pods: List[Dict], name_contains: str) -> Dict[str, int]:
        statistics = defaultdict(int)
        for pod in WorkflowPodsMetric._filter(pods, name_contains):
            phase = pod.get("status", {}).get("phase")
            statistics[phase] += 1
        return dict(statistics)

    def _collect(self, parameters: Dict) -> Any:
        kubectl_cmd = ("kubectl", "get", "pods", "-o", "json")

        output = subprocess.check_output(kubectl_cmd)
        pods = json.loads(output).get("items", [])

        result = {
            "batch_pods": self._count(pods, "run-batch"),
            "job_pods": self._count(pods, "run-job"),
        }

        return result


METRICS = [
    NumberOfDBConnectionsMetric(),
    WorkflowPodsMetric(),
    WorkflowDBStatusesMetric(),
]


def _build_monitored_results_path(workflow: str) -> Path:
    return Path(f"{workflow}_monitored_results.json")


def _save_metrics(workflow: str, results: Dict) -> None:
    with open(_build_monitored_results_path(workflow), "w") as f:
        json.dump(results, f)


def _collect_metrics(parameters: Dict) -> Dict[str, Any]:
    collected_metrics = {}
    for metric in METRICS:
        try:
            result = metric.collect(parameters)
            collected_metrics = dict(collected_metrics, **result)
        except Exception as error:
            logger.error(
                f"Error during collection of {metric.name} metric. Details: {error}"
            )
    return collected_metrics


def _print_metrics() -> None:
    logger.info("The following metrics will be collected:")
    for m in METRICS:
        logger.info(f"- {m.name}")


def monitor(workflow: str, sleep: int) -> None:
    """Periodically collect the defined metrics and save them to a JSON file.

    This function is blocking.
    """
    _print_metrics()
    logger.info("Starting monitoring...")

    all_metrics = {}
    metrics_parameters = {
        "workflow": workflow,
    }

    try:
        while True:
            # If the metrics take, say, a couple of seconds to collect,
            # monitored_date will be slightly less accurate.
            monitored_date = get_utc_now_timestamp()
            collected_metrics = _collect_metrics(metrics_parameters)
            all_metrics[monitored_date] = collected_metrics
            _save_metrics(workflow, all_metrics)

            time.sleep(sleep)
    except KeyboardInterrupt:
        logger.info("Stopping monitoring...")
    finally:
        _save_metrics(workflow, all_metrics)
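For orientation, the saved `<workflow>_monitored_results.json` file maps each collection timestamp to the metric values collected at that moment. A sketch of one snapshot with illustrative values (the exact timestamp format depends on `get_utc_now_timestamp`):

```json
{
  "2021-11-09T10:28:02": {
    "db_connections_number": 12,
    "workflows_pods_status": {
      "batch_pods": {"Running": 5, "Succeeded": 20},
      "job_pods": {"Running": 3, "Pending": 1}
    },
    "workflow_db_statuses": {"running": 5, "finished": 20}
  }
}
```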
question: The `phase` value does not represent exactly what `kubectl get pods` (the `STATUS` column) displays. It has `Pending`, `Running` and `Succeeded` statuses. In addition, it has a `Failed` status which, for some reason, appears for batch pods at the end when workflows have finished successfully.

I guess this happens because batch pods enter the `NotReady` state once the workflow engine finishes successfully and exits. You cannot easily get the `NotReady` state from the kubectl JSON output; you need to deduce it from the container statuses.

What do you think about the situation? Any ideas on how to get a more accurate status?
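One possible way to get closer to what the `STATUS` column shows is to refine `phase` with container readiness. A minimal sketch, assuming the standard core/v1 `containerStatuses[].ready` flags; the helper name `effective_phase` is hypothetical and not part of this PR:

```python
from typing import Dict


def effective_phase(pod: Dict) -> str:
    """Return the pod phase, refined with container readiness."""
    status = pod.get("status", {})
    phase = status.get("phase", "Unknown")
    container_statuses = status.get("containerStatuses", [])
    # A pod that is still "Running" but whose containers are no longer
    # all ready is effectively "NotReady".
    if phase == "Running" and container_statuses:
        if not all(c.get("ready", False) for c in container_statuses):
            return "NotReady"
    return phase
```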
The `phase` values are described here. I think it's a pretty accurate metric for us to use. But this case of a successfully finished workflow showing a `Failed` status could be confusing. I guess one way we could solve it is to include the `containerStatuses` in the final output (the downside is that it makes the output much more verbose); see the sketch after this comment for an example. Then it would be visible that the `Failed` status happens because the containers terminated. But I am not sure if we need it; it could be confusing to have too much output.
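A sketch of what including `containerStatuses` could look like, as a hypothetical extension of `WorkflowPodsMetric._count` (not part of this PR); the one-key `state` dict per container is standard in the core/v1 schema:

```python
from collections import defaultdict
from typing import Dict, List


def count_with_containers(pods: List[Dict], name_contains: str) -> Dict:
    """Count pods per phase and record a compact container-state view."""
    statistics = defaultdict(lambda: {"count": 0, "containers": []})
    for pod in pods:
        if name_contains not in pod.get("metadata", {}).get("name", ""):
            continue
        status = pod.get("status", {})
        phase = status.get("phase")
        statistics[phase]["count"] += 1
        for c in status.get("containerStatuses", []):
            # "state" holds exactly one of "waiting"/"running"/"terminated".
            state = next(iter(c.get("state", {})), "unknown")
            statistics[phase]["containers"].append({c.get("name"): state})
    return dict(statistics)
```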
An example of the JSON `status` output for a single pod from the `kubectl get pods -o json` command is sketched after this comment. As described above, `phase` can show pods as `Failed` while they are terminating, and it is not so straightforward to extract more precise information from this JSON. Some checks can be done on `containerStatuses` or, maybe, on `conditions`.
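An illustrative reconstruction of such a pod `status` fragment (field names follow the core/v1 PodStatus schema; the values are invented for illustration):

```json
{
  "status": {
    "phase": "Failed",
    "conditions": [
      {"type": "Ready", "status": "False", "reason": "PodFailed"}
    ],
    "containerStatuses": [
      {
        "name": "workflow-engine",
        "ready": false,
        "state": {"terminated": {"exitCode": 0, "reason": "Completed"}}
      },
      {
        "name": "job-controller",
        "ready": false,
        "state": {"terminated": {"exitCode": 137, "reason": "Error"}}
      }
    ]
  }
}
```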
The current goal of the monitoring command is to collect data that can later be visualized along with the execution progress plot. With the additional `containerStatuses` output it is not clear how to visualize it (there is a `phase` but also one or more containers in `containerStatuses`). I would prefer to reduce it to one field (`Running: 5`, `Failed: 1`, etc.), but the question is how :)
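One possible heuristic for collapsing everything into a single field, sketched below (not part of this PR): treat a `Failed` phase whose containers all exited with `0` or `137` during pod deletion as terminated rather than failed, then count pods per derived status.

```python
from collections import Counter
from typing import Dict, List


def reduce_status(pod: Dict) -> str:
    """Reduce phase plus containerStatuses to a single per-pod status."""
    status = pod.get("status", {})
    phase = status.get("phase", "Unknown")
    containers = status.get("containerStatuses", [])
    exit_codes = [
        c["state"]["terminated"].get("exitCode")
        for c in containers
        if "terminated" in c.get("state", {})
    ]
    # Heuristic: containers killed during pod deletion (SIGKILL, exit
    # code 137) or finished cleanly (0) mean "terminated", not "failed".
    if phase == "Failed" and exit_codes and all(code in (0, 137) for code in exit_codes):
        return "Terminated"
    return phase


def count_by_status(pods: List[Dict]) -> Dict[str, int]:
    """Produce the desired one-field summary, e.g. {"Running": 5, "Failed": 1}."""
    return dict(Counter(reduce_status(pod) for pod in pods))
```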
If I understood correctly, when the pod is terminated, `SIGTERM` is sent by k8s; apparently the job controller takes too much time to exit, so k8s then sends `SIGKILL` (exit code 137) and marks the pod as `Failed`. An example of the JSON is sketched after this comment.

Check the timestamps in `containerStatuses`: `workflow-engine` finished at `10:28:02`, so around that time REANA asks k8s to delete the pod. Take a look at the `job-controller` finish time, `10:28:32`: exactly 30 seconds after `workflow-engine`, and 30 seconds is the default time in k8s for the graceful shutdown of a pod (details). The same happened in the other batch pods: the job controller is not terminating within 30 seconds when k8s asks it to. @tiborsimko

suggestion: I think the above case deserves a separate issue. I can leave the `phase` value in the metric for this PR. I assume that when we figure out why the exit code is `137` instead of `0`, the `phase` value will show the `Succeeded` status for batch pods.
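An illustrative reconstruction of the `containerStatuses` timestamps described above (the finish times are those quoted in the comment; the date and remaining fields are invented; note that exit code 137 is 128 + 9, i.e. `SIGKILL`):

```json
"containerStatuses": [
  {
    "name": "workflow-engine",
    "state": {
      "terminated": {
        "exitCode": 0,
        "reason": "Completed",
        "finishedAt": "2021-11-09T10:28:02Z"
      }
    }
  },
  {
    "name": "job-controller",
    "state": {
      "terminated": {
        "exitCode": 137,
        "reason": "Error",
        "finishedAt": "2021-11-09T10:28:32Z"
      }
    }
  }
]
```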
Opened the issue to fix the `Failed` phase for batch pods. We can proceed with this PR and leave `phase` as it is.