From a102d90a34a410cfb78dab795fbf4fb83cf58443 Mon Sep 17 00:00:00 2001 From: Jordan Conway Date: Wed, 30 Jul 2025 09:42:12 -0400 Subject: [PATCH 1/2] Add notifications to the high number of messages alert Signed-off-by: Jordan Conway --- monitors.tf | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/monitors.tf b/monitors.tf index 71c8ac8..9e4436a 100644 --- a/monitors.tf +++ b/monitors.tf @@ -9,6 +9,8 @@ resource "datadog_monitor" "ci_retry_deadletter" { Verify that system is able to scale up EC2 instances by checking logs. @webhook-lf-incident-io + @slack-PyTorch-pytorch-infra-alerts + @slack-Linux_Foundation-pytorch-alerts MSG type = "query alert" @@ -51,5 +53,8 @@ anomalies( EOT message = < Date: Wed, 30 Jul 2025 09:44:44 -0400 Subject: [PATCH 2/2] Fix typo in incident.io webhook Signed-off-by: Jordan Conway --- integrations.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations.tf b/integrations.tf index edc37d9..593a15c 100644 --- a/integrations.tf +++ b/integrations.tf @@ -45,7 +45,7 @@ resource "datadog_integration_slack_channel" "pytorch-infra-alerts" { # Create a new Datadog webhook resource "datadog_webhook" "lf-incident-io" { - name = "lf-inceident-io" + name = "lf-incident-io" url = "https://api.incident.io/v2/alert_events/datadog/01JKTRSFTE6H2SR4AFM4VGWZFN" encode_as = "json"