Skip to content

Commit

Permalink
Failed Scheduled Queries Report (getredash#3798)
Browse files Browse the repository at this point in the history
* initial work on e-mail report for failed queries

* send failure report only for scheduled queries and not for adhoc queries

* add setting to determine if to send failure reports

* add setting to determine interval of aggregated e-mail report

* html templating of scheduled query failure report

* break line

* support timeouts for failure reports

* aggregate errors within message and warn if approaching threshold

* handle errors in QueryExecutor.run instead of on_failure

* move failure report to its own module

* indicate that failure count is since last report

* copy changes

* format with <code>

* styling, copy and add a link to the query instead of the query text

* separate reports with <hr>

* switch to UTC

* move <h2> to actual e-mail subject

* add explicit message for SoftTimeLimitExceeded

* fix test to use soft time limits

* default query failure threshold to 100

* use base_url from utils

* newlines. newlines everywhere.

* remove redundant import

* apply new design for failure report

* use jinja to format the failure report

* don't show comment block if no comment is provided

* don't send emails if, for some reason, there are no available errors

* subtract 1 from failure count, because the first one is represented by 'Last failed'

* don't show '+X more failures' if there's only one

* extract subject to variable

* format as text, while we're at it

* allow scrolling for long exception messages

* test that e-mails are scheduled only  when beneath limit

* test for indicating when approaching report limits + refactor

* test that failures are aggregated

* test that report counts per query and reason

* test that the latest failure occurence is reported

* force sending reports for testing purposes

* Update redash/templates/emails/failures.html

Co-Authored-By: Ran Byron <ranbena@gmail.com>

* Update redash/templates/emails/failures.html

Co-Authored-By: Ran Byron <ranbena@gmail.com>

* Update redash/tasks/failure_report.py

* add org setting for email reports

* remove logo from failure report email

* correctly use the organization setting for sending failure reports

* use user id as key for failure reports data structure

* Update redash/tasks/failure_report.py

Co-Authored-By: Arik Fraimovich <arik@arikfr.com>

* build comments while creating context for e-mail templates

* figure out the base url when creating the e-mail

* no need to expire pending failure report keys as they are deleted anyway when sent

* a couple of CodeClimate changes

* refactor key creationg to a single location

* refactor tests to send e-mail from a single function

* use beat to schedule a periodic send_aggregated_errors task instead of using countdown per email

* remove pending key as it is no longer required when a periodic task picks up the reports to send

* a really important blank line. REALLY important.

* Revert "a really important blank line. REALLY important."

This reverts commit c7d8ed8.

* a really important blank line. REALLY important. It is the best blank line.

* don't send failure emails to disabled users
  • Loading branch information
Omer Lachish authored and harveyrendell committed Nov 14, 2019
1 parent 57ed300 commit 7466fd2
Show file tree
Hide file tree
Showing 11 changed files with 258 additions and 5 deletions.
8 changes: 8 additions & 0 deletions client/app/pages/settings/OrganizationSettings.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,14 @@ class OrganizationSettings extends React.Component {
Enable multi-byte (Chinese, Japanese, and Korean) search for query names and descriptions (slower)
</Checkbox>
</Form.Item>
<Form.Item label="Email Reports">
<Checkbox
name="send_email_on_failed_scheduled_queries"
checked={formValues.send_email_on_failed_scheduled_queries}
onChange={e => this.handleChange('send_email_on_failed_scheduled_queries', e.target.checked)}
>Email query owners when scheduled queries fail
</Checkbox>
</Form.Item>
<Form.Item label="Feature Flags">
<Checkbox
name="feature_show_permissions_control"
Expand Down
3 changes: 3 additions & 0 deletions redash/settings/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,9 @@ def email_server_is_configured():

HOST = os.environ.get('REDASH_HOST', '')

SEND_FAILURE_EMAIL_INTERVAL = int(os.environ.get('REDASH_SEND_FAILURE_EMAIL_INTERVAL', 60))
MAX_FAILURE_REPORTS_PER_QUERY = int(os.environ.get('REDASH_MAX_FAILURE_REPORTS_PER_QUERY', 100))

ALERTS_DEFAULT_MAIL_SUBJECT_TEMPLATE = os.environ.get('REDASH_ALERTS_DEFAULT_MAIL_SUBJECT_TEMPLATE', "({state}) {alert_name}")

# How many requests are allowed per IP to the login page before
Expand Down
3 changes: 3 additions & 0 deletions redash/settings/organization.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@
JWT_AUTH_HEADER_NAME = os.environ.get("REDASH_JWT_AUTH_HEADER_NAME", "")

FEATURE_SHOW_PERMISSIONS_CONTROL = parse_boolean(os.environ.get("REDASH_FEATURE_SHOW_PERMISSIONS_CONTROL", "false"))
SEND_EMAIL_ON_FAILED_SCHEDULED_QUERIES = parse_boolean(
os.environ.get('REDASH_SEND_EMAIL_ON_FAILED_SCHEDULED_QUERIES', 'false'))

settings = {
"auth_password_login_enabled": PASSWORD_LOGIN_ENABLED,
Expand All @@ -51,4 +53,5 @@
"auth_jwt_auth_cookie_name": JWT_AUTH_COOKIE_NAME,
"auth_jwt_auth_header_name": JWT_AUTH_HEADER_NAME,
"feature_show_permissions_control": FEATURE_SHOW_PERMISSIONS_CONTROL,
"send_email_on_failed_scheduled_queries": SEND_EMAIL_ON_FAILED_SCHEDULED_QUERIES
}
1 change: 1 addition & 0 deletions redash/tasks/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from .general import record_event, version_check, send_mail, sync_user_details
from .queries import QueryTask, refresh_queries, refresh_schemas, cleanup_query_results, execute_query, empty_schedules
from .alerts import check_alerts_for_query
from .failure_report import notify_of_failure
72 changes: 72 additions & 0 deletions redash/tasks/failure_report.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import datetime
import re
from collections import Counter
from flask import render_template
from redash.tasks.general import send_mail
from redash.worker import celery
from redash import redis_connection, settings, models
from redash.utils import json_dumps, json_loads, base_url


def key(user_id):
return 'aggregated_failures:{}'.format(user_id)


def comment_for(failure):
schedule_failures = failure.get('schedule_failures')
if schedule_failures > settings.MAX_FAILURE_REPORTS_PER_QUERY * 0.75:
return """NOTICE: This query has failed a total of {schedule_failures} times.
Reporting may stop when the query exceeds {max_failure_reports} overall failures.""".format(
schedule_failures=schedule_failures,
max_failure_reports=settings.MAX_FAILURE_REPORTS_PER_QUERY
)


@celery.task(name="redash.tasks.send_aggregated_errors")
def send_aggregated_errors():
for key in redis_connection.scan_iter(key("*")):
user_id = re.search(r'\d+', key).group()
send_failure_report(user_id)


def send_failure_report(user_id):
user = models.User.get_by_id(user_id)
errors = [json_loads(e) for e in redis_connection.lrange(key(user_id), 0, -1)]

if errors:
errors.reverse()
occurrences = Counter((e.get('id'), e.get('message')) for e in errors)
unique_errors = {(e.get('id'), e.get('message')): e for e in errors}

context = {
'failures': [{
'id': v.get('id'),
'name': v.get('name'),
'failed_at': v.get('failed_at'),
'failure_reason': v.get('message'),
'failure_count': occurrences[k],
'comment': comment_for(v)
} for k, v in unique_errors.iteritems()],
'base_url': base_url(user.org)
}

html = render_template('emails/failures.html', **context)
text = render_template('emails/failures.txt', **context)
subject = "Redash failed to execute {} of your scheduled queries".format(len(unique_errors.keys()))
send_mail.delay([user.email], subject, html, text)

redis_connection.delete(key(user_id))


def notify_of_failure(message, query):
subscribed = query.org.get_setting('send_email_on_failed_scheduled_queries')
exceeded_threshold = query.schedule_failures >= settings.MAX_FAILURE_REPORTS_PER_QUERY

if subscribed and not query.user.is_disabled and not exceeded_threshold:
redis_connection.lpush(key(query.user.id), json_dumps({
'id': query.id,
'name': query.name,
'message': message,
'schedule_failures': query.schedule_failures,
'failed_at': datetime.datetime.utcnow().strftime("%B %d, %Y %I:%M%p UTC")
}))
15 changes: 11 additions & 4 deletions redash/tasks/queries.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import logging
import signal
import time

import redis
from celery.exceptions import SoftTimeLimitExceeded, TimeLimitExceeded
from celery.result import AsyncResult
Expand All @@ -11,10 +10,12 @@
from redash import models, redis_connection, settings, statsd_client
from redash.query_runner import InterruptException
from redash.tasks.alerts import check_alerts_for_query
from redash.tasks.failure_report import notify_of_failure
from redash.utils import gen_query_hash, json_dumps, utcnow, mustache_render
from redash.worker import celery

logger = get_task_logger(__name__)
TIMEOUT_MESSAGE = "Query exceeded Redash query execution time limit."


def _job_lock_id(query_hash, data_source_id):
Expand Down Expand Up @@ -56,7 +57,7 @@ def to_dict(self):
status = self.STATUSES[task_status]

if isinstance(result, (TimeLimitExceeded, SoftTimeLimitExceeded)):
error = "Query exceeded Redash query execution time limit."
error = TIMEOUT_MESSAGE
status = 4
elif isinstance(result, Exception):
error = result.message
Expand Down Expand Up @@ -142,7 +143,7 @@ def enqueue_query(query, data_source, user_id, is_api_key=False, scheduled_query
result = execute_query.apply_async(args=args,
argsrepr=argsrepr,
queue=queue_name,
time_limit=time_limit)
soft_time_limit=time_limit)

job = QueryTask(async_result=result)
logging.info("[%s] Created new job: %s", query_hash, job.id)
Expand Down Expand Up @@ -338,7 +339,11 @@ def run(self):
try:
data, error = query_runner.run_query(annotated_query, self.user)
except Exception as e:
error = text_type(e)
if isinstance(e, SoftTimeLimitExceeded):
error = TIMEOUT_MESSAGE
else:
error = text_type(e)

data = None
logging.warning('Unexpected error while running query:', exc_info=1)

Expand All @@ -354,6 +359,7 @@ def run(self):
self.scheduled_query = models.db.session.merge(self.scheduled_query, load=False)
self.scheduled_query.schedule_failures += 1
models.db.session.add(self.scheduled_query)
notify_of_failure(error, self.scheduled_query)
models.db.session.commit()
raise result
else:
Expand Down Expand Up @@ -411,5 +417,6 @@ def execute_query(self, query, data_source_id, metadata, user_id=None,
scheduled_query = models.Query.query.get(scheduled_query_id)
else:
scheduled_query = None

return QueryExecutor(self, query, data_source_id, user_id, is_api_key, metadata,
scheduled_query).run()
41 changes: 41 additions & 0 deletions redash/templates/emails/failures.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
<html>

<head>
<style>
body {
margin: 0;
}
</style>
</head>

<body style="margin: 0;">
<div
style="background: #f6f8f9; font-family: arial; font-size: 14px; padding: 20px; color: #333; line-height: 20px">
<h1 style="font-size: 14px; font-weight: normal;">Redash failed to run the following queries:</h1>

{% for failure in failures %}
<div style="background: white;padding: 5px 20px 20px 20px; border-radius: 3px; box-shadow: 0 4px 9px -3px rgba(102,136,153,.15); margin-top: 20px">
<h2 style="font-size: 16px; position: relative; padding-right: 90px;">{{failure.name}} <a href="{{base_url}}/queries/{{failure.id}}" style="background-color: #ff7964;font-size: 13px;color: white;text-decoration: none;padding: 0 5px;display: inline-block;border-radius: 3px;font-weight: normal;position: absolute;right: 0;top:-1px">Go
to Query</a></h2>
<div>
<p>
Last failed: {{failure.failed_at}}
{% if failure.failure_count > 1 %}
<br />
<small style="color: #8c8c8c;font-size: 13px;">&nbsp; + {{failure.failure_count - 1}} more failures since last report</small>
{% endif %}
</p>
<h3 style="font-size: 14px; font-weight:normal; margin-bottom: 6px">Exception</h3>
<div style="background: #ececec;padding: 8px;border-radius: 3px;font-family: monospace; word-break: break-all;">{{failure.failure_reason}}</div>
</div>

{% if failure.comment %}
<div style="margin-top: 25px;font-size: 13px;border-top: 1px solid #ececec;padding-top: 19px;">{{failure.comment}}</div>
{% endif %}
</div>
{% endfor %}

</div>
</body>

</html>
15 changes: 15 additions & 0 deletions redash/templates/emails/failures.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
Redash failed to run the following queries:

{% for failure in failures %}

{{failure.name}} ({{base_url}}/queries/{{failure.id}})
Last failed: {{failure.failed_at}}{% if failure.failure_count > 1 %} + {{failure.failure_count - 1}} more failures since last report{% endif %}
Exception:

{{failure.failure_reason}}

{% if failure.comment %}
{{failure.comment}}
{% endif %}

{% endfor %}
4 changes: 4 additions & 0 deletions redash/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,10 @@
'sync_user_details': {
'task': 'redash.tasks.sync_user_details',
'schedule': timedelta(minutes=1),
},
'send_aggregated_errors': {
'task': 'redash.tasks.send_aggregated_errors',
'schedule': timedelta(minutes=settings.SEND_FAILURE_EMAIL_INTERVAL),
}
}

Expand Down
99 changes: 99 additions & 0 deletions tests/tasks/test_failure_report.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
from unittest import TestCase

import mock
from freezegun import freeze_time
import dateutil

from tests import BaseTestCase
from redash import redis_connection, models, settings
from redash.tasks.failure_report import notify_of_failure, send_failure_report, send_aggregated_errors, key
from redash.utils import json_loads

class TestSendAggregatedErrorsTask(BaseTestCase):
def setUp(self):
super(TestSendAggregatedErrorsTask, self).setUp()
redis_connection.flushall()
self.factory.org.set_setting('send_email_on_failed_scheduled_queries', True)

def notify(self, message="Oh no, I failed!", query=None, **kwargs):
if query is None:
query = self.factory.create_query(**kwargs)

notify_of_failure(message, query)
return key(query.user.id)

@mock.patch('redash.tasks.failure_report.render_template')
def send_email(self, user, render_template):
send_failure_report(user.id)

_, context = render_template.call_args
return context['failures']

def test_schedules_email_if_failure_count_is_beneath_limit(self):
key = self.notify(schedule_failures=settings.MAX_FAILURE_REPORTS_PER_QUERY - 1)
email_pending = redis_connection.exists(key)
self.assertTrue(email_pending)

def test_does_not_report_if_failure_count_is_beyond_limit(self):
key = self.notify(schedule_failures=settings.MAX_FAILURE_REPORTS_PER_QUERY)
email_pending = redis_connection.exists(key)
self.assertFalse(email_pending)

def test_does_not_report_if_organization_is_not_subscribed(self):
self.factory.org.set_setting('send_email_on_failed_scheduled_queries', False)
key = self.notify()
email_pending = redis_connection.exists(key)
self.assertFalse(email_pending)

def test_does_not_report_if_query_owner_is_disabled(self):
self.factory.user.disable()
key = self.notify()
email_pending = redis_connection.exists(key)
self.assertFalse(email_pending)

def test_does_not_indicate_when_not_near_limit_for_a_query(self):
self.notify(schedule_failures=settings.MAX_FAILURE_REPORTS_PER_QUERY / 2)
failures = self.send_email(self.factory.user)

self.assertFalse(failures[0]['comment'])

def test_indicates_when_near_limit_for_a_query(self):
self.notify(schedule_failures=settings.MAX_FAILURE_REPORTS_PER_QUERY - 1)
failures = self.send_email(self.factory.user)

self.assertTrue(failures[0]['comment'])

def test_aggregates_different_queries_in_a_single_report(self):
key1 = self.notify(message="I'm a failure")
key2 = self.notify(message="I'm simply not a success")

self.assertEqual(key1, key2)

def test_counts_failures_for_each_reason(self):
query = self.factory.create_query()

self.notify(message="I'm a failure", query=query)
self.notify(message="I'm a failure", query=query)
self.notify(message="I'm a different type of failure", query=query)
self.notify(message="I'm a totally different query")

failures = self.send_email(query.user)

f1 = next(f for f in failures if f["failure_reason"] == "I'm a failure")
self.assertEqual(2, f1['failure_count'])
f2 = next(f for f in failures if f["failure_reason"] == "I'm a different type of failure")
self.assertEqual(1, f2['failure_count'])
f3 = next(f for f in failures if f["failure_reason"] == "I'm a totally different query")
self.assertEqual(1, f3['failure_count'])

def test_shows_latest_failure_time(self):
query = self.factory.create_query()

with freeze_time("2000-01-01"):
self.notify(query=query)

self.notify(query=query)

failures = self.send_email(query.user)
latest_failure = dateutil.parser.parse(failures[0]['failed_at'])
self.assertNotEqual(2000, latest_failure.year)
2 changes: 1 addition & 1 deletion tests/tasks/test_queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def test_limits_query_time(self, _):
enqueue_query(query.query_text, query.data_source, query.user_id, False, query, {'Username': 'Arik', 'Query ID': query.id})

_, kwargs = execute_query.apply_async.call_args
self.assertEqual(60, kwargs.get('time_limit'))
self.assertEqual(60, kwargs.get('soft_time_limit'))

def test_multiple_enqueue_of_different_query(self):
query = self.factory.create_query()
Expand Down

0 comments on commit 7466fd2

Please sign in to comment.