diff --git a/common/common.tf b/common/common.tf index 84135b5..9d69640 100644 --- a/common/common.tf +++ b/common/common.tf @@ -179,6 +179,12 @@ variable "notify_prod_override" { type = list(string) } +variable "group_by" { + default = ["name", "aws_account", "env", "datadog_managed"] + description = "List of tags to group by" + type = list(string) +} + locals { # tag related locals @@ -336,4 +342,7 @@ ${local.alert_context} {{#is_alert}} ${local.notify_on_alert} {{/is_alert}} {{#is_recovery}} ${local.notify_on_recovery} {{/is_recovery}} END + + service_group_by = join(",", formatlist("\"%s\"", var.group_by)) + query_group_by = join(",", var.group_by) } diff --git a/host/agent/README.md b/host/agent/README.md index b71562a..f3af365 100644 --- a/host/agent/README.md +++ b/host/agent/README.md @@ -35,22 +35,27 @@ No modules. | [alert\_critical\_priority](#input\_alert\_critical\_priority) | Priority for alerts within critical threshold (P1-P5, uses monitor defaults if not specified) | `string` | `null` | no | | [alert\_message](#input\_alert\_message) | Message to prepend to alert notifications | `string` | `"Alert"` | no | | [alert\_nodata\_priority](#input\_alert\_nodata\_priority) | Priority for alerts within warning threshold (P1-P5, uses monitor defaults if not specified) | `string` | `null` | no | -| [base\_tags](#input\_base\_tags) | Base tags (key:value format) to add to this type of check (combined with `local.tags` and `var.additional_tags`, generally you should not change this) | `list(string)` |
[| no | +| [base\_tags](#input\_base\_tags) | Base tags (key:value format) to add to this type of check (combined with `local.tags` and `var.additional_tags`, generally you should not change this) | `list(string)` |
"resource:apigateway"
]
[| no | | [cost\_center](#input\_cost\_center) | Cost Center of the monitored resource (leave blank to omit tag) | `string` | `null` | no | | [dashboard\_link](#input\_dashboard\_link) | Dashboard link to include in message | `string` | `null` | no | -| [env](#input\_env) | Environment the monitored resource is in (leave blank to omit tag) | `string` | n/a | yes | +| [env](#input\_env) | Environment the monitored resource is in (leave blank to omit tag) | `string` | `null` | no | | [evaluation\_delay](#input\_evaluation\_delay) | Monitor evaluation delay (see [https://docs.datadoghq.com/monitors/configuration/?tab=thresholdalert#set-alert-conditions](Datadog Docs)) | `number` | `900` | no | +| [group\_by](#input\_group\_by) | List of tags to group by | `list(string)` |
"resource:ec2"
]
[| no | | [host\_unreachable\_enabled](#input\_host\_unreachable\_enabled) | Flag to enable Host unreachable monitor | `bool` | `true` | no | +| [host\_unreachable\_use\_message](#input\_host\_unreachable\_use\_message) | Flag to enable Host unreachable alerting | `bool` | `true` | no | | [monitor\_exclude\_tags](#input\_monitor\_exclude\_tags) | Tags to be excluded in the monitoring query. Specify in key:value format | `list(string)` | `[]` | no | | [monitor\_include\_tags](#input\_monitor\_include\_tags) | Tags to be included in the monitoring query. Specify in key:value format | `list(string)` | `[]` | no | | [new\_group\_delay](#input\_new\_group\_delay) | Delay in seconds before generating alerts for a new resource | `number` | `300` | no | | [notify\_alert\_override](#input\_notify\_alert\_override) | List of notifications for alerts in critical threshold (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| [notify\_crit\_override](#input\_notify\_crit\_override) | List of notifications for 24x7 alerts in critical threshold (uses `notify_default` otherwise) | `list(string)` | `[]` | no | | [notify\_default](#input\_notify\_default) | List of alert notifications (can be overridden based on alert type) | `list(string)` | n/a | yes | | [notify\_no\_data](#input\_notify\_no\_data) | Alert if no matching data is found | `bool` | `false` | no | | [notify\_nodata\_override](#input\_notify\_nodata\_override) | List of notifications for no data (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| [notify\_nonprod\_override](#input\_notify\_nonprod\_override) | List of notifications for non-prod alerts in critical threshold (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| [notify\_prod\_override](#input\_notify\_prod\_override) | List of notifications for 12x5 prod alerts in critical threshold (uses `notify_default` otherwise) | `list(string)` | `[]` | no | | 
[notify\_recovery\_override](#input\_notify\_recovery\_override) | List of notifications for alert recovery (uses `notify_default` otherwise) | `list(string)` | `[]` | no | | [notify\_warn\_override](#input\_notify\_warn\_override) | List of notifications for alerts in warning threshold (uses `notify_default` otherwise) | `list(string)` | `[]` | no | -| [renotify\_interval](#input\_renotify\_interval) | Interval in minutes to re-send notifications about an alert | `number` | `0` | no | +| [renotify\_interval](#input\_renotify\_interval) | Interval in minutes to re-send notifications about an alert | `number` | `60` | no | | [runbook\_link](#input\_runbook\_link) | Runbook link to include in message | `string` | `null` | no | | [service](#input\_service) | Service associated with the monitored resource (leave blank to omit tag) | `string` | `null` | no | | [team](#input\_team) | Team supporting the monitored resource (leave blank to omit tag) | `string` | `null` | no | diff --git a/host/agent/main.tf b/host/agent/main.tf index dc54b4d..1d28ecc 100644 --- a/host/agent/main.tf +++ b/host/agent/main.tf @@ -11,22 +11,22 @@ locals { resource "datadog_monitor" "host_unreachable" { count = var.host_unreachable_enabled ? 1 : 0 - name = join("", [local.title_prefix, "Datadog Agent Status - {{name.name}}", local.title_suffix]) + name = join("", [local.title_prefix, "Datadog Agent Status - {{name.name}}", local.title_suffix]) include_tags = false message = var.host_unreachable_use_message ? local.query_alert_base_message : "" - tags = concat(local.common_tags, var.base_tags, var.additional_tags) - type = "service check" + tags = concat(local.common_tags, var.base_tags, var.additional_tags) + type = "service check" evaluation_delay = var.evaluation_delay new_group_delay = var.new_group_delay - no_data_timeframe = "5" + no_data_timeframe = "5" notify_no_data = true renotify_interval = var.renotify_interval require_full_window = true timeout_h = var.timeout_h query = <
"name",
"aws_account",
"env",
"datadog_managed"
]
[| no | +| [base\_tags](#input\_base\_tags) | Base tags (key:value format) to add to this type of check (combined with `local.tags` and `var.additional_tags`, generally you should not change this) | `list(string)` |
"resource:apigateway"
]
[| no | | [cost\_center](#input\_cost\_center) | Cost Center of the monitored resource (leave blank to omit tag) | `string` | `null` | no | | [dashboard\_link](#input\_dashboard\_link) | Dashboard link to include in message | `string` | `null` | no | -| [env](#input\_env) | Environment the monitored resource is in (leave blank to omit tag) | `string` | n/a | yes | +| [env](#input\_env) | Environment the monitored resource is in (leave blank to omit tag) | `string` | `null` | no | | [evaluation\_delay](#input\_evaluation\_delay) | Monitor evaluation delay (see [https://docs.datadoghq.com/monitors/configuration/?tab=thresholdalert#set-alert-conditions](Datadog Docs)) | `number` | `900` | no | +| [group\_by](#input\_group\_by) | List of tags to group by | `list(string)` |
"resource:ec2"
]
[| no | | [monitor\_exclude\_tags](#input\_monitor\_exclude\_tags) | Tags to be excluded in the monitoring query. Specify in key:value format | `list(string)` | `[]` | no | | [monitor\_include\_tags](#input\_monitor\_include\_tags) | Tags to be included in the monitoring query. Specify in key:value format | `list(string)` | `[]` | no | | [new\_group\_delay](#input\_new\_group\_delay) | Delay in seconds before generating alerts for a new resource | `number` | `300` | no | | [notify\_alert\_override](#input\_notify\_alert\_override) | List of notifications for alerts in critical threshold (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| [notify\_crit\_override](#input\_notify\_crit\_override) | List of notifications for 24x7 alerts in critical threshold (uses `notify_default` otherwise) | `list(string)` | `[]` | no | | [notify\_default](#input\_notify\_default) | List of alert notifications (can be overridden based on alert type) | `list(string)` | n/a | yes | | [notify\_no\_data](#input\_notify\_no\_data) | Alert if no matching data is found | `bool` | `false` | no | | [notify\_nodata\_override](#input\_notify\_nodata\_override) | List of notifications for no data (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| [notify\_nonprod\_override](#input\_notify\_nonprod\_override) | List of notifications for non-prod alerts in critical threshold (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| [notify\_prod\_override](#input\_notify\_prod\_override) | List of notifications for 12x5 prod alerts in critical threshold (uses `notify_default` otherwise) | `list(string)` | `[]` | no | | [notify\_recovery\_override](#input\_notify\_recovery\_override) | List of notifications for alert recovery (uses `notify_default` otherwise) | `list(string)` | `[]` | no | | [notify\_warn\_override](#input\_notify\_warn\_override) | List of notifications for alerts in warning threshold (uses `notify_default` otherwise) | `list(string)` | 
`[]` | no | -| [renotify\_interval](#input\_renotify\_interval) | Interval in minutes to re-send notifications about an alert | `number` | `0` | no | +| [renotify\_interval](#input\_renotify\_interval) | Interval in minutes to re-send notifications about an alert | `number` | `60` | no | | [runbook\_link](#input\_runbook\_link) | Runbook link to include in message | `string` | `null` | no | | [service](#input\_service) | Service associated with the monitored resource (leave blank to omit tag) | `string` | `null` | no | | [system\_clock\_enabled](#input\_system\_clock\_enabled) | Flag to enable Host unreachable monitor | `bool` | `true` | no | +| [system\_clock\_use\_message](#input\_system\_clock\_use\_message) | Flag to enable System Clock alerting | `bool` | `false` | no | | [team](#input\_team) | Team supporting the monitored resource (leave blank to omit tag) | `string` | `null` | no | | [timeout\_h](#input\_timeout\_h) | Auto-resolve alert in specified hours if condition no longer matches | `number` | `0` | no | | [title\_prefix](#input\_title\_prefix) | Prefix all alerts with specified value in brackets | `string` | `null` | no | diff --git a/host/clock/main.tf b/host/clock/main.tf index 94ce70c..be6e892 100644 --- a/host/clock/main.tf +++ b/host/clock/main.tf @@ -11,11 +11,11 @@ locals { resource "datadog_monitor" "system_clock" { count = var.system_clock_enabled ? 1 : 0 - name = join("", [local.title_prefix, "System Clock - {{name.name}}", local.title_suffix]) + name = join("", [local.title_prefix, "System Clock - {{name.name}}", local.title_suffix]) include_tags = false message = var.system_clock_use_message ? 
local.query_alert_base_message : "" - tags = concat(local.common_tags, var.base_tags, var.additional_tags) - type = "service check" + tags = concat(local.common_tags, var.base_tags, var.additional_tags) + type = "service check" evaluation_delay = var.evaluation_delay new_group_delay = var.new_group_delay @@ -25,7 +25,7 @@ resource "datadog_monitor" "system_clock" { timeout_h = var.timeout_h query = <
"name",
"aws_account",
"env",
"datadog_managed"
]
[| no | +| [base\_tags](#input\_base\_tags) | Base tags (key:value format) to add to this type of check (combined with `local.tags` and `var.additional_tags`, generally you should not change this) | `list(string)` |
"resource:apigateway"
]
[| no | | [cost\_center](#input\_cost\_center) | Cost Center of the monitored resource (leave blank to omit tag) | `string` | `null` | no | | [cpu\_utilization\_enabled](#input\_cpu\_utilization\_enabled) | Flag to enable CPU Utilitzation monitor | `bool` | `true` | no | | [cpu\_utilization\_no\_data\_window](#input\_cpu\_utilization\_no\_data\_window) | No data threshold (in minutes, 0 to disable) | `number` | `10` | no | @@ -43,19 +43,24 @@ No modules. | [cpu\_utilization\_threshold\_warning](#input\_cpu\_utilization\_threshold\_warning) | Warning threshold (percent) | `number` | `80` | no | | [cpu\_utilization\_time\_aggregator](#input\_cpu\_utilization\_time\_aggregator) | Monitor aggregator for CPU high [available values: min, max or avg] | `string` | `"min"` | no | | [cpu\_utilization\_timeframe](#input\_cpu\_utilization\_timeframe) | Monitor timeframe for CPU high [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | `string` | `"last_1h"` | no | +| [cpu\_utilization\_use\_message](#input\_cpu\_utilization\_use\_message) | Flag to enable CPU Utilization alerting | `bool` | `false` | no | | [dashboard\_link](#input\_dashboard\_link) | Dashboard link to include in message | `string` | `null` | no | -| [env](#input\_env) | Environment the monitored resource is in (leave blank to omit tag) | `string` | n/a | yes | +| [env](#input\_env) | Environment the monitored resource is in (leave blank to omit tag) | `string` | `null` | no | | [evaluation\_delay](#input\_evaluation\_delay) | Monitor evaluation delay (see [https://docs.datadoghq.com/monitors/configuration/?tab=thresholdalert#set-alert-conditions](Datadog Docs)) | `number` | `900` | no | +| [group\_by](#input\_group\_by) | List of tags to group by | `list(string)` |
"resource:ec2"
]
[| no | | [monitor\_exclude\_tags](#input\_monitor\_exclude\_tags) | Tags to be excluded in the monitoring query. Specify in key:value format | `list(string)` | `[]` | no | | [monitor\_include\_tags](#input\_monitor\_include\_tags) | Tags to be included in the monitoring query. Specify in key:value format | `list(string)` | `[]` | no | | [new\_group\_delay](#input\_new\_group\_delay) | Delay in seconds before generating alerts for a new resource | `number` | `300` | no | | [notify\_alert\_override](#input\_notify\_alert\_override) | List of notifications for alerts in critical threshold (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| [notify\_crit\_override](#input\_notify\_crit\_override) | List of notifications for 24x7 alerts in critical threshold (uses `notify_default` otherwise) | `list(string)` | `[]` | no | | [notify\_default](#input\_notify\_default) | List of alert notifications (can be overridden based on alert type) | `list(string)` | n/a | yes | | [notify\_no\_data](#input\_notify\_no\_data) | Alert if no matching data is found | `bool` | `false` | no | | [notify\_nodata\_override](#input\_notify\_nodata\_override) | List of notifications for no data (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| [notify\_nonprod\_override](#input\_notify\_nonprod\_override) | List of notifications for non-prod alerts in critical threshold (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| [notify\_prod\_override](#input\_notify\_prod\_override) | List of notifications for 12x5 prod alerts in critical threshold (uses `notify_default` otherwise) | `list(string)` | `[]` | no | | [notify\_recovery\_override](#input\_notify\_recovery\_override) | List of notifications for alert recovery (uses `notify_default` otherwise) | `list(string)` | `[]` | no | | [notify\_warn\_override](#input\_notify\_warn\_override) | List of notifications for alerts in warning threshold (uses `notify_default` otherwise) | `list(string)` | 
`[]` | no | -| [renotify\_interval](#input\_renotify\_interval) | Interval in minutes to re-send notifications about an alert | `number` | `0` | no | +| [renotify\_interval](#input\_renotify\_interval) | Interval in minutes to re-send notifications about an alert | `number` | `60` | no | | [runbook\_link](#input\_runbook\_link) | Runbook link to include in message | `string` | `null` | no | | [service](#input\_service) | Service associated with the monitored resource (leave blank to omit tag) | `string` | `null` | no | | [team](#input\_team) | Team supporting the monitored resource (leave blank to omit tag) | `string` | `null` | no | diff --git a/host/cpu/main.tf b/host/cpu/main.tf index 88fc47e..faa5e9a 100644 --- a/host/cpu/main.tf +++ b/host/cpu/main.tf @@ -12,7 +12,7 @@ resource "datadog_monitor" "cpu_utilization" { count = var.cpu_utilization_enabled ? 1 : 0 name = join("", [local.title_prefix, "CPU Utilization - {{name.name}}", local.title_suffix]) - message = var.cpu_utilization_use_message ? local.query_alert_base_message : "" + message = var.cpu_utilization_use_message ? local.query_alert_base_message : "" tags = concat(local.common_tags, var.base_tags, var.additional_tags) type = "query alert" @@ -23,12 +23,12 @@ resource "datadog_monitor" "cpu_utilization" { renotify_interval = var.renotify_interval require_full_window = true timeout_h = var.timeout_h - include_tags = false + include_tags = false query = <
"name",
"aws_account",
"env",
"datadog_managed"
]
[| no | +| [base\_tags](#input\_base\_tags) | Base tags (key:value format) to add to this type of check (combined with `local.tags` and `var.additional_tags`, generally you should not change this) | `list(string)` |
"resource:apigateway"
]
[| no | | [cost\_center](#input\_cost\_center) | Cost Center of the monitored resource (leave blank to omit tag) | `string` | `null` | no | | [dashboard\_link](#input\_dashboard\_link) | Dashboard link to include in message | `string` | `null` | no | | [disk\_inodes\_enabled](#input\_disk\_inodes\_enabled) | Flag to enable Free disk inodes monitor | `string` | `"true"` | no | @@ -45,6 +45,7 @@ No modules. | [disk\_inodes\_threshold\_warning](#input\_disk\_inodes\_threshold\_warning) | Free disk space warning threshold | `number` | `90` | no | | [disk\_inodes\_time\_aggregator](#input\_disk\_inodes\_time\_aggregator) | Monitor aggregator for Free disk inodes [available values: min, max or avg] | `string` | `"min"` | no | | [disk\_inodes\_timeframe](#input\_disk\_inodes\_timeframe) | Monitor timeframe for Free disk inodes [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | `string` | `"last_5m"` | no | +| [disk\_inodes\_use\_message](#input\_disk\_inodes\_use\_message) | Flag to enable Free disk inodes alerting | `string` | `"true"` | no | | [disk\_space\_enabled](#input\_disk\_space\_enabled) | Flag to enable Free diskspace monitor | `string` | `"true"` | no | | [disk\_space\_forecast\_algorithm](#input\_disk\_space\_forecast\_algorithm) | Algorithm for the Free diskspace Forecast monitor [available values: `linear` or `seasonal`] | `string` | `"linear"` | no | | [disk\_space\_forecast\_deviations](#input\_disk\_space\_forecast\_deviations) | Deviations for the Free diskspace Forecast monitor [available values: `1`, `2`, `3`, `4` or `5`] | `string` | `1` | no | @@ -57,22 +58,28 @@ No modules. 
| [disk\_space\_forecast\_threshold\_critical\_recovery](#input\_disk\_space\_forecast\_threshold\_critical\_recovery) | Free disk space forecast recovery threshold | `number` | `72` | no | | [disk\_space\_forecast\_time\_aggregator](#input\_disk\_space\_forecast\_time\_aggregator) | Monitor aggregator for Free diskspace forecast [available values: min, max or avg] | `string` | `"max"` | no | | [disk\_space\_forecast\_timeframe](#input\_disk\_space\_forecast\_timeframe) | Monitor timeframe for Free diskspace forecast [available values: `next_12h`, `next_#d` (1, 2, or 3), `next_#w` (1 or 2) or `next_#mo` (1, 2 or 3)] | `string` | `"next_1w"` | no | +| [disk\_space\_forecast\_use\_message](#input\_disk\_space\_forecast\_use\_message) | Flag to enable Free diskspace forecast alerting | `string` | `"false"` | no | | [disk\_space\_threshold\_critical](#input\_disk\_space\_threshold\_critical) | Free disk space critical threshold | `number` | `90` | no | | [disk\_space\_threshold\_warning](#input\_disk\_space\_threshold\_warning) | Free disk space warning threshold | `number` | `80` | no | | [disk\_space\_time\_aggregator](#input\_disk\_space\_time\_aggregator) | Monitor aggregator for Free diskspace [available values: min, max or avg] | `string` | `"max"` | no | | [disk\_space\_timeframe](#input\_disk\_space\_timeframe) | Monitor timeframe for Free diskspace [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | `string` | `"last_5m"` | no | -| [env](#input\_env) | Environment the monitored resource is in (leave blank to omit tag) | `string` | n/a | yes | +| [disk\_space\_use\_message](#input\_disk\_space\_use\_message) | Flag to enable Free diskspace alerting | `string` | `"true"` | no | +| [env](#input\_env) | Environment the monitored resource is in (leave blank to omit tag) | `string` | `null` | no | | [evaluation\_delay](#input\_evaluation\_delay) | Monitor evaluation delay (see 
[https://docs.datadoghq.com/monitors/configuration/?tab=thresholdalert#set-alert-conditions](Datadog Docs)) | `number` | `900` | no | +| [group\_by](#input\_group\_by) | List of tags to group by | `list(string)` |
"resource:ec2"
]
[| no | | [monitor\_exclude\_tags](#input\_monitor\_exclude\_tags) | Tags to be excluded in the monitoring query. Specify in key:value format | `list(string)` | `[]` | no | | [monitor\_include\_tags](#input\_monitor\_include\_tags) | Tags to be included in the monitoring query. Specify in key:value format | `list(string)` | `[]` | no | | [new\_group\_delay](#input\_new\_group\_delay) | Delay in seconds before generating alerts for a new resource | `number` | `300` | no | | [notify\_alert\_override](#input\_notify\_alert\_override) | List of notifications for alerts in critical threshold (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| [notify\_crit\_override](#input\_notify\_crit\_override) | List of notifications for 24x7 alerts in critical threshold (uses `notify_default` otherwise) | `list(string)` | `[]` | no | | [notify\_default](#input\_notify\_default) | List of alert notifications (can be overridden based on alert type) | `list(string)` | n/a | yes | | [notify\_no\_data](#input\_notify\_no\_data) | Alert if no matching data is found | `bool` | `false` | no | | [notify\_nodata\_override](#input\_notify\_nodata\_override) | List of notifications for no data (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| [notify\_nonprod\_override](#input\_notify\_nonprod\_override) | List of notifications for non-prod alerts in critical threshold (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| [notify\_prod\_override](#input\_notify\_prod\_override) | List of notifications for 12x5 prod alerts in critical threshold (uses `notify_default` otherwise) | `list(string)` | `[]` | no | | [notify\_recovery\_override](#input\_notify\_recovery\_override) | List of notifications for alert recovery (uses `notify_default` otherwise) | `list(string)` | `[]` | no | | [notify\_warn\_override](#input\_notify\_warn\_override) | List of notifications for alerts in warning threshold (uses `notify_default` otherwise) | `list(string)` | 
`[]` | no | -| [renotify\_interval](#input\_renotify\_interval) | Interval in minutes to re-send notifications about an alert | `number` | `0` | no | +| [renotify\_interval](#input\_renotify\_interval) | Interval in minutes to re-send notifications about an alert | `number` | `60` | no | | [runbook\_link](#input\_runbook\_link) | Runbook link to include in message | `string` | `null` | no | | [service](#input\_service) | Service associated with the monitored resource (leave blank to omit tag) | `string` | `null` | no | | [team](#input\_team) | Team supporting the monitored resource (leave blank to omit tag) | `string` | `null` | no | diff --git a/host/disk/main.tf b/host/disk/main.tf index e6e4c4c..ce57f41 100644 --- a/host/disk/main.tf +++ b/host/disk/main.tf @@ -12,7 +12,7 @@ resource "datadog_monitor" "disk_space" { count = var.disk_space_enabled ? 1 : 0 name = join("", [local.title_prefix, "Disk Space - {{name.name}}", local.title_suffix]) - message = var.disk_space_use_message ? local.query_alert_base_message : "" + message = var.disk_space_use_message ? local.query_alert_base_message : "" tags = concat(local.common_tags, var.base_tags, var.additional_tags) type = "query alert" @@ -26,7 +26,7 @@ resource "datadog_monitor" "disk_space" { query = <
"name",
"aws_account",
"env",
"datadog_managed"
]
[| no | +| [base\_tags](#input\_base\_tags) | Base tags (key:value format) to add to this type of check (combined with `local.tags` and `var.additional_tags`, generally you should not change this) | `list(string)` |
"resource:apigateway"
]
[| no | | [cost\_center](#input\_cost\_center) | Cost Center of the monitored resource (leave blank to omit tag) | `string` | `null` | no | | [dashboard\_link](#input\_dashboard\_link) | Dashboard link to include in message | `string` | `null` | no | -| [env](#input\_env) | Environment the monitored resource is in (leave blank to omit tag) | `string` | n/a | yes | +| [env](#input\_env) | Environment the monitored resource is in (leave blank to omit tag) | `string` | `null` | no | | [evaluation\_delay](#input\_evaluation\_delay) | Monitor evaluation delay (see [https://docs.datadoghq.com/monitors/configuration/?tab=thresholdalert#set-alert-conditions](Datadog Docs)) | `number` | `900` | no | +| [group\_by](#input\_group\_by) | List of tags to group by | `list(string)` |
"resource:ec2"
]
[| no | | [memory\_enabled](#input\_memory\_enabled) | Flag to enable Free memory monitor | `string` | `"true"` | no | | [memory\_threshold\_critical](#input\_memory\_threshold\_critical) | Free disk space critical threshold | `number` | `5` | no | | [memory\_threshold\_warning](#input\_memory\_threshold\_warning) | Free disk space warning threshold | `number` | `10` | no | | [memory\_time\_aggregator](#input\_memory\_time\_aggregator) | Monitor aggregator for Free memory [available values: min, max or avg] | `string` | `"max"` | no | | [memory\_timeframe](#input\_memory\_timeframe) | Monitor timeframe for Free memory [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | `string` | `"last_5m"` | no | +| [memory\_use\_message](#input\_memory\_use\_message) | Flag to enable Free memory alerting | `string` | `"true"` | no | | [monitor\_exclude\_tags](#input\_monitor\_exclude\_tags) | Tags to be excluded in the monitoring query. Specify in key:value format | `list(string)` | `[]` | no | | [monitor\_include\_tags](#input\_monitor\_include\_tags) | Tags to be included in the monitoring query. 
Specify in key:value format | `list(string)` | `[]` | no | | [new\_group\_delay](#input\_new\_group\_delay) | Delay in seconds before generating alerts for a new resource | `number` | `300` | no | | [notify\_alert\_override](#input\_notify\_alert\_override) | List of notifications for alerts in critical threshold (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| [notify\_crit\_override](#input\_notify\_crit\_override) | List of notifications for 24x7 alerts in critical threshold (uses `notify_default` otherwise) | `list(string)` | `[]` | no | | [notify\_default](#input\_notify\_default) | List of alert notifications (can be overridden based on alert type) | `list(string)` | n/a | yes | | [notify\_no\_data](#input\_notify\_no\_data) | Alert if no matching data is found | `bool` | `false` | no | | [notify\_nodata\_override](#input\_notify\_nodata\_override) | List of notifications for no data (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| [notify\_nonprod\_override](#input\_notify\_nonprod\_override) | List of notifications for non-prod alerts in critical threshold (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| [notify\_prod\_override](#input\_notify\_prod\_override) | List of notifications for 12x5 prod alerts in critical threshold (uses `notify_default` otherwise) | `list(string)` | `[]` | no | | [notify\_recovery\_override](#input\_notify\_recovery\_override) | List of notifications for alert recovery (uses `notify_default` otherwise) | `list(string)` | `[]` | no | | [notify\_warn\_override](#input\_notify\_warn\_override) | List of notifications for alerts in warning threshold (uses `notify_default` otherwise) | `list(string)` | `[]` | no | -| [renotify\_interval](#input\_renotify\_interval) | Interval in minutes to re-send notifications about an alert | `number` | `0` | no | +| [renotify\_interval](#input\_renotify\_interval) | Interval in minutes to re-send notifications about an alert | `number` | 
`60` | no | | [runbook\_link](#input\_runbook\_link) | Runbook link to include in message | `string` | `null` | no | | [service](#input\_service) | Service associated with the monitored resource (leave blank to omit tag) | `string` | `null` | no | | [team](#input\_team) | Team supporting the monitored resource (leave blank to omit tag) | `string` | `null` | no | diff --git a/host/memory/main.tf b/host/memory/main.tf index d5e1ac5..4bc8ffb 100644 --- a/host/memory/main.tf +++ b/host/memory/main.tf @@ -11,16 +11,16 @@ locals { resource "datadog_monitor" "memory" { count = var.memory_enabled ? 1 : 0 - name = join("", [local.title_prefix, "Usable Memory - {{name.name}}", local.title_suffix]) + name = join("", [local.title_prefix, "Usable Memory - {{name.name}}", local.title_suffix]) include_tags = false message = var.memory_use_message ? local.query_alert_base_message : "" - tags = concat(local.common_tags, var.base_tags, var.additional_tags) - type = "query alert" + tags = concat(local.common_tags, var.base_tags, var.additional_tags) + type = "query alert" query = <
"name",
"aws_account",
"env",
"datadog_managed"
]
[| no | +| [base\_tags](#input\_base\_tags) | Base tags (key:value format) to add to this type of check (combined with `local.tags` and `var.additional_tags`, generally you should not change this) | `list(string)` |
"resource:apigateway"
]
[| no | | [cost\_center](#input\_cost\_center) | Cost Center of the monitored resource (leave blank to omit tag) | `string` | `null` | no | | [dashboard\_link](#input\_dashboard\_link) | Dashboard link to include in message | `string` | `null` | no | -| [env](#input\_env) | Environment the monitored resource is in (leave blank to omit tag) | `string` | n/a | yes | +| [env](#input\_env) | Environment the monitored resource is in (leave blank to omit tag) | `string` | `null` | no | | [evaluation\_delay](#input\_evaluation\_delay) | Monitor evaluation delay (see [https://docs.datadoghq.com/monitors/configuration/?tab=thresholdalert#set-alert-conditions](Datadog Docs)) | `number` | `900` | no | +| [group\_by](#input\_group\_by) | List of tags to group by | `list(string)` |
"resource:ec2"
]
[| no | | [monitor\_exclude\_tags](#input\_monitor\_exclude\_tags) | Tags to be excluded in the monitoring query. Specify in key:value format | `list(string)` | `[]` | no | | [monitor\_include\_tags](#input\_monitor\_include\_tags) | Tags to be included in the monitoring query. Specify in key:value format | `list(string)` | `[]` | no | | [new\_group\_delay](#input\_new\_group\_delay) | Delay in seconds before generating alerts for a new resource | `number` | `300` | no | | [notify\_alert\_override](#input\_notify\_alert\_override) | List of notifications for alerts in critical threshold (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| [notify\_crit\_override](#input\_notify\_crit\_override) | List of notifications for 24x7 alerts in critical threshold (uses `notify_default` otherwise) | `list(string)` | `[]` | no | | [notify\_default](#input\_notify\_default) | List of alert notifications (can be overridden based on alert type) | `list(string)` | n/a | yes | | [notify\_no\_data](#input\_notify\_no\_data) | Alert if no matching data is found | `bool` | `false` | no | | [notify\_nodata\_override](#input\_notify\_nodata\_override) | List of notifications for no data (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| [notify\_nonprod\_override](#input\_notify\_nonprod\_override) | List of notifications for non-prod alerts in critical threshold (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| [notify\_prod\_override](#input\_notify\_prod\_override) | List of notifications for 12x5 prod alerts in critical threshold (uses `notify_default` otherwise) | `list(string)` | `[]` | no | | [notify\_recovery\_override](#input\_notify\_recovery\_override) | List of notifications for alert recovery (uses `notify_default` otherwise) | `list(string)` | `[]` | no | | [notify\_warn\_override](#input\_notify\_warn\_override) | List of notifications for alerts in warning threshold (uses `notify_default` otherwise) | `list(string)` | 
`[]` | no | | [process\_alert\_enabled](#input\_process\_alert\_enabled) | Flag to enable Process Check monitor | `string` | `"true"` | no | @@ -55,7 +59,8 @@ No modules. | [process\_alert\_threshold\_critical](#input\_process\_alert\_threshold\_critical) | Process Alert critical threshold | `number` | `1` | no | | [process\_alert\_threshold\_warning](#input\_process\_alert\_threshold\_warning) | Process Alert warning threshold | `number` | `null` | no | | [process\_alert\_timeframe](#input\_process\_alert\_timeframe) | Monitor timeframe for Process Alert [available values: `#m` (1, 5, 10, 15, or 30), `#h` (1, 2, or 4), or `1d`] | `string` | `"5m"` | no | -| [renotify\_interval](#input\_renotify\_interval) | Interval in minutes to re-send notifications about an alert | `number` | `0` | no | +| [process\_alert\_use\_message](#input\_process\_alert\_use\_message) | Flag to enable Process Check alerting | `string` | `"true"` | no | +| [renotify\_interval](#input\_renotify\_interval) | Interval in minutes to re-send notifications about an alert | `number` | `60` | no | | [runbook\_link](#input\_runbook\_link) | Runbook link to include in message | `string` | `null` | no | | [service](#input\_service) | Service associated with the monitored resource (leave blank to omit tag) | `string` | `null` | no | | [team](#input\_team) | Team supporting the monitored resource (leave blank to omit tag) | `string` | `null` | no | diff --git a/host/process/main.tf b/host/process/main.tf index c0ddd76..5acbf0a 100644 --- a/host/process/main.tf +++ b/host/process/main.tf @@ -12,7 +12,7 @@ resource "datadog_monitor" "process_alert" { count = var.process_alert_enabled ? 1 : 0 name = join("", [local.title_prefix, "Process Alert - {{host.name}}", local.title_suffix]) - message = var.process_alert_use_message ? local.query_alert_base_message : "" + message = var.process_alert_use_message ? 
local.query_alert_base_message : "" tags = concat(local.common_tags, var.base_tags, var.additional_tags) type = "process alert" diff --git a/host/swap/README.md b/host/swap/README.md index 4632a84..5ac490b 100644 --- a/host/swap/README.md +++ b/host/swap/README.md @@ -35,21 +35,25 @@ No modules. | [alert\_critical\_priority](#input\_alert\_critical\_priority) | Priority for alerts within critical threshold (P1-P5, uses monitor defaults if not specified) | `string` | `null` | no | | [alert\_message](#input\_alert\_message) | Message to prepend to alert notifications | `string` | `"Alert"` | no | | [alert\_nodata\_priority](#input\_alert\_nodata\_priority) | Priority for alerts within warning threshold (P1-P5, uses monitor defaults if not specified) | `string` | `null` | no | -| [base\_tags](#input\_base\_tags) | Base tags (key:value format) to add to this type of check (combined with `local.tags` and `var.additional_tags`, generally you should not change this) | `list(string)` |
"name",
"aws_account",
"env",
"datadog_managed"
]
[| no | +| [base\_tags](#input\_base\_tags) | Base tags (key:value format) to add to this type of check (combined with `local.tags` and `var.additional_tags`, generally you should not change this) | `list(string)` |
"resource:apigateway"
]
[| no | | [cost\_center](#input\_cost\_center) | Cost Center of the monitored resource (leave blank to omit tag) | `string` | `null` | no | | [dashboard\_link](#input\_dashboard\_link) | Dashboard link to include in message | `string` | `null` | no | -| [env](#input\_env) | Environment the monitored resource is in (leave blank to omit tag) | `string` | n/a | yes | +| [env](#input\_env) | Environment the monitored resource is in (leave blank to omit tag) | `string` | `null` | no | | [evaluation\_delay](#input\_evaluation\_delay) | Monitor evaluation delay (see [Datadog Docs](https://docs.datadoghq.com/monitors/configuration/?tab=thresholdalert#set-alert-conditions)) | `number` | `900` | no | +| [group\_by](#input\_group\_by) | List of tags to group by | `list(string)` |
"resource:ec2"
]
[| no | | [monitor\_exclude\_tags](#input\_monitor\_exclude\_tags) | Tags to be excluded in the monitoring query. Specify in key:value format | `list(string)` | `[]` | no | | [monitor\_include\_tags](#input\_monitor\_include\_tags) | Tags to be included in the monitoring query. Specify in key:value format | `list(string)` | `[]` | no | | [new\_group\_delay](#input\_new\_group\_delay) | Delay in seconds before generating alerts for a new resource | `number` | `300` | no | | [notify\_alert\_override](#input\_notify\_alert\_override) | List of notifications for alerts in critical threshold (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| [notify\_crit\_override](#input\_notify\_crit\_override) | List of notifications for 24x7 alerts in critical threshold (uses `notify_default` otherwise) | `list(string)` | `[]` | no | | [notify\_default](#input\_notify\_default) | List of alert notifications (can be overridden based on alert type) | `list(string)` | n/a | yes | | [notify\_no\_data](#input\_notify\_no\_data) | Alert if no matching data is found | `bool` | `false` | no | | [notify\_nodata\_override](#input\_notify\_nodata\_override) | List of notifications for no data (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| [notify\_nonprod\_override](#input\_notify\_nonprod\_override) | List of notifications for non-prod alerts in critical threshold (uses `notify_default` otherwise) | `list(string)` | `[]` | no | +| [notify\_prod\_override](#input\_notify\_prod\_override) | List of notifications for 12x5 prod alerts in critical threshold (uses `notify_default` otherwise) | `list(string)` | `[]` | no | | [notify\_recovery\_override](#input\_notify\_recovery\_override) | List of notifications for alert recovery (uses `notify_default` otherwise) | `list(string)` | `[]` | no | | [notify\_warn\_override](#input\_notify\_warn\_override) | List of notifications for alerts in warning threshold (uses `notify_default` otherwise) | `list(string)` | 
`[]` | no | -| [renotify\_interval](#input\_renotify\_interval) | Interval in minutes to re-send notifications about an alert | `number` | `0` | no | +| [renotify\_interval](#input\_renotify\_interval) | Interval in minutes to re-send notifications about an alert | `number` | `60` | no | | [runbook\_link](#input\_runbook\_link) | Runbook link to include in message | `string` | `null` | no | | [service](#input\_service) | Service associated with the monitored resource (leave blank to omit tag) | `string` | `null` | no | | [swap\_enabled](#input\_swap\_enabled) | Flag to enable Swap monitor | `string` | `"true"` | no | @@ -57,6 +61,7 @@ No modules. | [swap\_threshold\_warning](#input\_swap\_threshold\_warning) | Free Swap warning threshold as percentage | `number` | `0.3` | no | | [swap\_time\_aggregator](#input\_swap\_time\_aggregator) | Monitor aggregator for Free Swap [available values: min, max or avg] | `string` | `"max"` | no | | [swap\_timeframe](#input\_swap\_timeframe) | Monitor timeframe for Free Swap [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | `string` | `"last_5m"` | no | +| [swap\_use\_message](#input\_swap\_use\_message) | Flag to enable Swap alerting | `string` | `"false"` | no | | [team](#input\_team) | Team supporting the monitored resource (leave blank to omit tag) | `string` | `null` | no | | [timeout\_h](#input\_timeout\_h) | Auto-resolve alert in specified hours if condition no longer matches | `number` | `0` | no | | [title\_prefix](#input\_title\_prefix) | Prefix all alerts with specified value in brackets | `string` | `null` | no | diff --git a/host/swap/main.tf b/host/swap/main.tf index 211a957..0e097e7 100644 --- a/host/swap/main.tf +++ b/host/swap/main.tf @@ -12,13 +12,13 @@ resource "datadog_monitor" "swap" { count = var.swap_enabled ? 1 : 0 name = join("", [local.title_prefix, "Usable Swap - {{name.name}}", local.title_suffix]) - message = var.swap_use_message ? 
local.query_alert_base_message : "" + message = var.swap_use_message ? local.query_alert_base_message : "" tags = concat(local.common_tags, var.base_tags, var.additional_tags) type = "query alert" query = <
"name",
"aws_account",
"env",
"datadog_managed"
]