diff --git a/README.md b/README.md index 37f3edc..d562225 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,9 @@ Features 7. Logging and Monitoring: Easily configure logging destinations for slow logs and engine logs, allowing you to monitor the performance and troubleshoot any issues efficiently. + 8. CloudWatch Alerts: Set up CloudWatch alarms to monitor the health and performance of your Redis cluster. Integrate these alarms with AWS Simple Notification Service (SNS) to receive real-time alerts. Use AWS Lambda functions to customize your alerting logic, and send notifications to Slack channels for immediate visibility into your Redis cluster's status. + + ## Uses Example ```hcl @@ -31,22 +34,27 @@ module "redis" { environment = "production" name = "redis" family = "redis6.x" - vpc_id = "vpc-06eb7eskaf" - subnets = ["subnet-0bfa3eskaf","subnet-0140bskaf"] - node_type = "cache.t3.small" - kms_key_arn = "arn:aws:kms:us-east-2:222222222222:key/kms_key_arn" - num_cache_nodes = 2 - engine_version = "6.x" - multi_az_enabled = false - availability_zones = 2 - automatic_failover_enabled = true - snapshot_retention_limit = 7 - at_rest_encryption_enabled = true - transit_encryption_enabled = false - notification_topic_arn = null - allowed_security_groups = [sg-0132a18skaf] - snapshot_window = "07:00-08:00" - maintenance_window = "sun:09:00-sun:10:00" + vpc_id = "vpc-06eb7eskaf" + subnets = ["subnet-0bfa3eskaf","subnet-0140bskaf"] + node_type = "cache.t3.small" + kms_key_arn = "arn:aws:kms:us-east-2:222222222222:key/kms_key_arn" + num_cache_nodes = 2 + engine_version = "6.x" + multi_az_enabled = false + availability_zones = 2 + automatic_failover_enabled = true + snapshot_retention_limit = 7 + transit_encryption_enabled = false + notification_topic_arn = null + allowed_security_groups = ["sg-0132a18skaf"] + snapshot_window = "07:00-08:00" + maintenance_window = "sun:09:00-sun:10:00" + cloudwatch_metric_alarms_enabled = true # For enabling basic alerting + 
alarm_cpu_threshold_percent = 70 + alarm_memory_threshold_bytes = "10000000" # in bytes + slack_username = "john" + slack_channel = "redis-alerts" + slack_webhook_url = "https://hooks.slack.com/services/xxxxxxxxx" } ``` @@ -79,6 +87,7 @@ Security scanning is graciously provided by Prowler. Proowler is the leading ful | Name | Version | |------|---------| +| [archive](#provider\_archive) | n/a | | [aws](#provider\_aws) | >= 4.23 | | [random](#provider\_random) | >= 3.0.0 | @@ -86,30 +95,44 @@ Security scanning is graciously provided by Prowler. Proowler is the leading ful | Name | Source | Version | |------|--------|---------| +| [cw\_sns\_slack](#module\_cw\_sns\_slack) | ./lambda | n/a | | [security\_group\_redis](#module\_security\_group\_redis) | terraform-aws-modules/security-group/aws | 4.13.0 | ## Resources | Name | Type | |------|------| +| [aws_cloudwatch_metric_alarm.cache_cpu](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_metric_alarm) | resource | +| [aws_cloudwatch_metric_alarm.cache_memory](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_metric_alarm) | resource | | [aws_elasticache_parameter_group.default](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/elasticache_parameter_group) | resource | | [aws_elasticache_replication_group.redis](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/elasticache_replication_group) | resource | | [aws_elasticache_subnet_group.elasticache](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/elasticache_subnet_group) | resource | +| [aws_kms_ciphertext.slack_url](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/kms_ciphertext) | resource | +| [aws_kms_key.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/kms_key) | resource | +| 
[aws_lambda_permission.sns_lambda_slack_invoke](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_permission) | resource | | [aws_secretsmanager_secret.secret_redis](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/secretsmanager_secret) | resource | | [aws_security_group_rule.cidr_ingress](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/security_group_rule) | resource | | [aws_security_group_rule.default_ingress](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/security_group_rule) | resource | +| [aws_sns_topic.slack_topic](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/sns_topic) | resource | +| [aws_sns_topic_subscription.slack-endpoint](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/sns_topic_subscription) | resource | | [random_password.password](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/password) | resource | +| [archive_file.lambdazip](https://registry.terraform.io/providers/hashicorp/archive/latest/docs/data-sources/file) | data source | | [aws_availability_zones.available](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/availability_zones) | data source | ## Inputs | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| +| [alarm\_actions](#input\_alarm\_actions) | Alarm action list | `list(string)` | `[]` | no | +| [alarm\_cpu\_threshold\_percent](#input\_alarm\_cpu\_threshold\_percent) | CPU threshold alarm level | `number` | `75` | no | +| [alarm\_memory\_threshold\_bytes](#input\_alarm\_memory\_threshold\_bytes) | Ram threshold alarm level in bytes | `number` | `10000000` | no | | [allowed\_cidr\_blocks](#input\_allowed\_cidr\_blocks) | A list of CIDR blocks which are allowed to access the database | `list(any)` | `[]` | no | | 
[allowed\_security\_groups](#input\_allowed\_security\_groups) | A list of Security Group ID's to allow access to | `list(any)` | `[]` | no | | [at\_rest\_encryption\_enabled](#input\_at\_rest\_encryption\_enabled) | (Optional) Whether to enable encryption at rest | `bool` | `true` | no | | [automatic\_failover\_enabled](#input\_automatic\_failover\_enabled) | Enable automatic failover | `bool` | `true` | no | | [availability\_zones](#input\_availability\_zones) | The no. of AZs | `string` | `2` | no | +| [cloudwatch\_metric\_alarms\_enabled](#input\_cloudwatch\_metric\_alarms\_enabled) | Boolean flag to enable/disable CloudWatch metrics alarms | `bool` | `false` | no | +| [cw\_sns\_topic\_arn](#input\_cw\_sns\_topic\_arn) | The ARN of the SNS topic to use for CloudWatch alarm notifications. | `string` | `""` | no | | [engine\_log\_destination](#input\_engine\_log\_destination) | The destination for engine logs(eg. Cloudwatch log-group name or kinesis firehose stream name) | `string` | `null` | no | | [engine\_log\_destination\_type](#input\_engine\_log\_destination\_type) | The type of destination for engine logs(eg . cloudwatch-logs or kinesis-firehose) | `string` | `""` | no | | [engine\_log\_format](#input\_engine\_log\_format) | the format for logs eg. json/text | `string` | `"json"` | no | @@ -124,9 +147,13 @@ Security scanning is graciously provided by Prowler. Proowler is the leading ful | [node\_type](#input\_node\_type) | The instance size of the redis cluster | `string` | `"cache.t3.micro"` | no | | [notification\_topic\_arn](#input\_notification\_topic\_arn) | (Optional) ARN of an SNS topic to send ElastiCache notifications | `string` | `null` | no | | [num\_cache\_nodes](#input\_num\_cache\_nodes) | The number of cache nodes | `number` | `1` | no | +| [ok\_actions](#input\_ok\_actions) | The list of actions to execute when this alarm transitions into an OK state from any other state. 
Each action is specified as an Amazon Resource Name (ARN) | `list(string)` | `[]` | no | | [parameter\_group\_description](#input\_parameter\_group\_description) | Parameter group | `string` | `null` | no | | [port](#input\_port) | The redis port | `number` | `6379` | no | | [recovery\_window\_aws\_secret](#input\_recovery\_window\_aws\_secret) | Number of days that AWS Secrets Manager waits before it can delete the secret. This value can be 0 to force deletion without recovery or range from 7 to 30 days. | `number` | `0` | no | +| [slack\_channel](#input\_slack\_channel) | The Slack channel where notifications will be posted. | `string` | `""` | no | +| [slack\_username](#input\_slack\_username) | The username to use when sending notifications to Slack. | `string` | `""` | no | +| [slack\_webhook\_url](#input\_slack\_webhook\_url) | The Slack Webhook URL where notifications will be sent. | `string` | `""` | no | | [slow\_log\_destination](#input\_slow\_log\_destination) | The destination for slow logs(eg. Cloudwatch log-group name or kinesis firehose stream name.) | `string` | `null` | no | | [slow\_log\_destination\_type](#input\_slow\_log\_destination\_type) | The type of destination for slow logs(eg . cloudwatch-logs or kinesis-firehose) | `string` | `""` | no | | [slow\_log\_format](#input\_slow\_log\_format) | the format for logs eg. 
json/text | `string` | `"json"` | no | diff --git a/examples/complete/main.tf b/examples/complete/main.tf index 91ce41b..13bdab2 100644 --- a/examples/complete/main.tf +++ b/examples/complete/main.tf @@ -1,14 +1,14 @@ locals { name = "redis" - region = "us-east-1" + region = "us-east-2" family = "redis6.x" node_type = "cache.t3.small" - vpc_id = "vpc-06f1a2f3a7" - subnet_ids = ["subnet-0bb128ab", "subnet-0b54928666a"] - kms_key_arn = "arn:aws:kms:us-east-1:2222222222:key/bcfdc1c5-1bbbdb467d90" + vpc_id = "vpc-0220830b5260698db" + subnet_ids = ["subnet-0d4dee4a7ea31a96d", "subnet-07fdc14616382f833"] + kms_key_arn = "" environment = "prod" redis_engine_version = "6.0" - allowed_security_groups = ["sg-0e8dab08e40"] + allowed_security_groups = ["sg-02c3f55874f6e0c64"] additional_tags = { Owner = "Organization_Name" Expires = "Never" @@ -17,19 +17,25 @@ locals { } module "redis" { - source = "squareops/elasticache-redis/aws" - name = local.name - family = local.family - node_type = local.node_type - environment = local.environment - engine_version = local.redis_engine_version - num_cache_nodes = 2 - vpc_id = local.vpc_id - subnets = local.subnet_ids - kms_key_arn = local.kms_key_arn - multi_az_enabled = false - availability_zones = 2 - snapshot_window = "07:00-08:00" - maintenance_window = "sun:09:00-sun:10:00" - allowed_security_groups = local.allowed_security_groups + source = "squareops/elasticache-redis/aws" + name = local.name + family = local.family + node_type = local.node_type + environment = local.environment + engine_version = local.redis_engine_version + num_cache_nodes = 2 + vpc_id = local.vpc_id + subnets = local.subnet_ids + kms_key_arn = local.kms_key_arn + multi_az_enabled = false + availability_zones = 2 + snapshot_window = "07:00-08:00" + maintenance_window = "sun:09:00-sun:10:00" + allowed_security_groups = local.allowed_security_groups + cloudwatch_metric_alarms_enabled = true + alarm_cpu_threshold_percent = 70 + alarm_memory_threshold_bytes = 
"10000000" # in bytes + slack_username = "" + slack_channel = "" + slack_webhook_url = "" } diff --git a/lambda/README.md b/lambda/README.md new file mode 100644 index 0000000..6c252e1 --- /dev/null +++ b/lambda/README.md @@ -0,0 +1,59 @@ +## Lambda for SNS +![squareops_avatar] + +[squareops_avatar]: https://squareops.com/wp-content/uploads/2022/12/squareops-logo.png + +### [SquareOps Technologies](https://squareops.com/) Your DevOps Partner for Accelerating cloud journey. +
+ +Here is Lambda that calls the Slack webhook and passes the alarm message as the payload. + +## Requirements + +No requirements. + +## Providers + +| Name | Version | +|------|---------| +| [aws](#provider\_aws) | n/a | + +## Modules + +No modules. + +## Resources + +| Name | Type | +|------|------| +| [aws_cloudwatch_log_group.lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_log_group) | resource | +| [aws_iam_role.lambda_exec_role](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | resource | +| [aws_iam_role_policy.lambda_cwl_policy](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource | +| [aws_lambda_function.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_function) | resource | +| [aws_iam_policy_document.lambda_cwl_access](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | +| [aws_iam_policy_document.lambda_exec_role_policy](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [artifact\_file](#input\_artifact\_file) | The path to the function's deployment package within the local filesystem | `string` | `null` | no | +| [cwl\_retention\_days](#input\_cwl\_retention\_days) | The retention time in days for the CloudWatch Logs Stream. | `number` | `30` | no | +| [description](#input\_description) | Description of what the Lambda Function does. | `string` | `null` | no | +| [environment](#input\_environment) | The Lambda environment's configuration settings. | `map(string)` | `{}` | no | +| [handler](#input\_handler) | The function entrypoint in the code. 
| `string` | `"index.handler"` | no | +| [memory\_size](#input\_memory\_size) | Amount of memory in MB your Lambda Function can use at runtime. | `number` | `128` | no | +| [name](#input\_name) | A unique name for the Lambda Function. | `string` | n/a | yes | +| [runtime](#input\_runtime) | The Runtime used in the Lambda Function. | `string` | n/a | yes | +| [tags](#input\_tags) | A mapping of tags to assign to the module resources. | `map(string)` | `{}` | no | +| [timeout](#input\_timeout) | The amount of time your Lambda Function has to run in seconds. | `number` | `6` | no | + +## Outputs + +| Name | Description | +|------|-------------| +| [arn](#output\_arn) | The ARN identifying the Lambda Function. | +| [exec\_role\_id](#output\_exec\_role\_id) | The ID of the Function's IAM Role. | +| [invoke\_arn](#output\_invoke\_arn) | The ARN to be used for invoking Lambda Function from API Gateway. | +| [name](#output\_name) | The name of the Lambda Function. | + diff --git a/lambda/data.tf b/lambda/data.tf new file mode 100644 index 0000000..679463c --- /dev/null +++ b/lambda/data.tf @@ -0,0 +1,32 @@ +# Lambda Assume Role policy +data "aws_iam_policy_document" "lambda_exec_role_policy" { + statement { + sid = "LambdaExecRolePolicy" + effect = "Allow" + principals { + identifiers = [ + "lambda.amazonaws.com", + ] + type = "Service" + } + actions = [ + "sts:AssumeRole", + ] + } +} + +# Lambda CloudWatch Logs access +data "aws_iam_policy_document" "lambda_cwl_access" { + statement { + sid = "LambdaCreateCloudWatchLogGroup" + effect = "Allow" + actions = [ + "logs:PutLogEvents", + "logs:CreateLogStream", + "logs:CreateLogGroup" + ] + resources = [ + "arn:aws:logs:*:*:log-group:/aws/lambda/*:*:*" + ] + } +} diff --git a/lambda/iam.tf b/lambda/iam.tf new file mode 100644 index 0000000..b6e4760 --- /dev/null +++ b/lambda/iam.tf @@ -0,0 +1,10 @@ +resource "aws_iam_role" "lambda_exec_role" { + name = "${replace(title(var.name), "-", "")}LambdaExecRole" + assume_role_policy = 
data.aws_iam_policy_document.lambda_exec_role_policy.json +} + +resource "aws_iam_role_policy" "lambda_cwl_policy" { + name = "${replace(title(var.name), "-", "")}LambdaCWLogsPolicy" + role = aws_iam_role.lambda_exec_role.id + policy = data.aws_iam_policy_document.lambda_cwl_access.json +} diff --git a/lambda/main.tf b/lambda/main.tf new file mode 100644 index 0000000..1a845ef --- /dev/null +++ b/lambda/main.tf @@ -0,0 +1,26 @@ +resource "aws_cloudwatch_log_group" "lambda" { + name = "/aws/lambda/${var.name}" + retention_in_days = var.cwl_retention_days + tags = var.tags +} + +resource "aws_lambda_function" "this" { + function_name = var.name + description = var.description + filename = var.artifact_file + source_code_hash = var.artifact_file != null ? filebase64sha256(var.artifact_file) : null + role = aws_iam_role.lambda_exec_role.arn + handler = var.handler + runtime = var.runtime + memory_size = var.memory_size + timeout = var.timeout + + dynamic "environment" { + for_each = (length(var.environment) > 0 ? [1] : []) + content { + variables = var.environment + } + } + + tags = var.tags +} diff --git a/lambda/outputs.tf b/lambda/outputs.tf new file mode 100644 index 0000000..e97e7ec --- /dev/null +++ b/lambda/outputs.tf @@ -0,0 +1,19 @@ +output "name" { + description = "The name of the Lambda Function." + value = aws_lambda_function.this.function_name +} + +output "arn" { + description = "The ARN identifying the Lambda Function." + value = aws_lambda_function.this.arn +} + +output "invoke_arn" { + description = "The ARN to be used for invoking Lambda Function from API Gateway." + value = aws_lambda_function.this.invoke_arn +} + +output "exec_role_id" { + description = "The ID of the Function's IAM Role." 
+  value       = aws_iam_role.lambda_exec_role.id
+}
diff --git a/lambda/sns_slack.py b/lambda/sns_slack.py
new file mode 100644
index 0000000..1351c7e
--- /dev/null
+++ b/lambda/sns_slack.py
@@ -0,0 +1,84 @@
+import json
+import re
+import os
+import boto3
+import urllib3
+
+# Lambda global variables
+region = os.environ["AWS_REGION"] # from Lambda default envs
+# NOTE(review): SLACK_URL is posted to as-is (nothing in this file decrypts it) --
+# confirm the Terraform side passes the plaintext webhook URL, not KMS ciphertext.
+slack_url = os.environ["SLACK_URL"]
+slack_channel = os.environ["SLACK_CHANNEL"]
+slack_user = os.environ["SLACK_USER"]
+
+# Module-level connection pool shared by every lambda_handler call in this process.
+http = urllib3.PoolManager()
+
+
+def format_cloudwatch_alarm_message(event):
+    """Build the Slack message text from the first SNS record in ``event``.
+
+    The SNS ``Message`` is parsed as a CloudWatch alarm JSON document. Fields that
+    are missing or null are rendered as "n/a" instead of raising, and a message
+    that is not an alarm document is returned verbatim so it still reaches Slack.
+    """
+    raw_message = event['Records'][0]['Sns']['Message']
+    try:
+        alarm_data = json.loads(raw_message)
+    except (TypeError, ValueError):
+        alarm_data = None
+    if not isinstance(alarm_data, dict) or "AlarmName" not in alarm_data:
+        # Not a CloudWatch alarm payload: forward the raw text unchanged.
+        return str(raw_message)
+
+    def field(source, key):
+        # An absent key and an explicit null are both shown as "n/a".
+        value = source.get(key)
+        return "n/a" if value is None else value
+
+    trigger = alarm_data.get("Trigger") or {}
+    alarm_name = field(alarm_data, "AlarmName")
+    alarm_description = field(alarm_data, "AlarmDescription")
+    new_state = field(alarm_data, "NewStateValue")
+    reason = field(alarm_data, "NewStateReason")
+    metric_name = field(trigger, "MetricName")
+    threshold = field(trigger, "Threshold")
+
+    message = "*:exclamation: CloudWatch Alarm Alert :exclamation:*\n\n"
+    message += f" *Alarm Name:* {alarm_name}\n"
+    message += f" *Description:* _{alarm_description}_\n"
+    message += f" *New State:* {new_state}\n"
+    message += f" *Reason:* _{reason}_\n"
+    message += f" *Metric Name:* {metric_name}\n"
+    message += f" *Threshold:* {threshold}\n"
+
+    return message
+
+
+def lambda_handler(event, context):
+    """Lambda entry point: post the formatted SNS alarm message to Slack.
+
+    The request payload plus Slack's response status and body are printed;
+    nothing is returned.
+    """
+    msg = {
+        "channel": slack_channel,
+        "username": slack_user,
+        "text": format_cloudwatch_alarm_message(event),
+        "icon_emoji": ":cloudwatch:"
+    }
+
+    encoded_msg = json.dumps(msg).encode('utf-8')
+    resp = http.request(
+        'POST',
+        slack_url,
+        body=encoded_msg,
+        headers={"Content-Type": "application/json"},
+    )
+
+    print({
+        "message": msg,
+        "status_code": resp.status,
+        "response": resp.data
+    })
diff --git a/lambda/variables.tf b/lambda/variables.tf
new file mode 100644
index 0000000..ba1013b
--- /dev/null
+++ b/lambda/variables.tf
@@ -0,0 +1,57 @@
+variable "artifact_file" {
+  type        = string
+  description = "The path to the function's deployment package within the local filesystem"
+  default     = null
+}
+
+variable 
"handler" { + type = string + description = "The function entrypoint in the code." + default = "index.handler" +} + +variable "memory_size" { + type = number + description = "Amount of memory in MB your Lambda Function can use at runtime." + default = 128 +} + +variable "timeout" { + type = number + description = "The amount of time your Lambda Function has to run in seconds." + default = 6 +} + +variable "description" { + type = string + description = "Description of what the Lambda Function does." + default = null +} + +variable "environment" { + type = map(string) + description = "The Lambda environment's configuration settings." + default = {} +} + +variable "cwl_retention_days" { + type = number + description = "The retention time in days for the CloudWatch Logs Stream." + default = 30 +} + +variable "tags" { + description = "A mapping of tags to assign to the module resources." + type = map(string) + default = {} +} + +variable "name" { + type = string + description = "A unique name for the Lambda Function." +} + +variable "runtime" { + type = string + description = "The Runtime used in the Lambda Function." +} diff --git a/main.tf b/main.tf index 34ba6e2..547afa3 100644 --- a/main.tf +++ b/main.tf @@ -48,7 +48,6 @@ resource "aws_elasticache_replication_group" "redis" { multi_az_enabled = var.multi_az_enabled kms_key_id = var.kms_key_arn auth_token = var.transit_encryption_enabled ? random_password.password.result : null - at_rest_encryption_enabled = var.at_rest_encryption_enabled transit_encryption_enabled = var.transit_encryption_enabled notification_topic_arn = var.notification_topic_arn maintenance_window = var.maintenance_window @@ -146,3 +145,138 @@ resource "aws_secretsmanager_secret" "secret_redis" { ) recovery_window_in_days = var.recovery_window_aws_secret } + +# Cloudwatch alarms +resource "aws_cloudwatch_metric_alarm" "cache_cpu" { + count = var.cloudwatch_metric_alarms_enabled ? 
1 : 0
+  alarm_name          = format("%s-%s-%s", var.environment, var.name, "cpu-utilization")
+  alarm_description   = "Redis cluster CPU utilization"
+  comparison_operator = "GreaterThanThreshold" # fires when average CPU rises above the threshold
+  evaluation_periods  = "1"
+  metric_name         = "CPUUtilization"
+  namespace           = "AWS/ElastiCache"
+  period              = "300" # seconds
+  statistic           = "Average"
+
+  threshold = var.alarm_cpu_threshold_percent
+
+  dimensions = {
+    CacheClusterId = tolist(aws_elasticache_replication_group.redis.member_clusters)[0] # metrics are published per member cluster, not per replication group ID; only the first member is watched
+  }
+
+  alarm_actions = [aws_sns_topic.slack_topic[0].arn] # same topic for ALARM and OK transitions
+  ok_actions    = [aws_sns_topic.slack_topic[0].arn]
+  depends_on    = [aws_sns_topic.slack_topic]
+
+  tags = merge(
+    { "Name" = format("%s-%s-%s", var.environment, var.name, "cpu_metric") },
+    local.tags,
+  )
+}
+
+resource "aws_cloudwatch_metric_alarm" "cache_memory" {
+  count               = var.cloudwatch_metric_alarms_enabled ? 1 : 0
+  alarm_name          = format("%s-%s-%s", var.environment, var.name, "used-memory")
+  alarm_description   = "Redis cluster freeable memory"
+  comparison_operator = "LessThanThreshold" # fires when average freeable memory drops below the threshold
+  evaluation_periods  = "1"
+  metric_name         = "FreeableMemory"
+  namespace           = "AWS/ElastiCache"
+  period              = "60" # seconds
+  statistic           = "Average"
+
+  threshold = var.alarm_memory_threshold_bytes
+
+  dimensions = {
+    CacheClusterId = tolist(aws_elasticache_replication_group.redis.member_clusters)[0] # metrics are published per member cluster, not per replication group ID; only the first member is watched
+  }
+
+  alarm_actions = [aws_sns_topic.slack_topic[0].arn] # same topic for ALARM and OK transitions
+  ok_actions    = [aws_sns_topic.slack_topic[0].arn]
+  depends_on    = [aws_sns_topic.slack_topic]
+
+  tags = merge(
+    { "Name" = format("%s-%s-%s", var.environment, var.name, "memory-metric") },
+    local.tags,
+  )
+}
+
+resource "aws_kms_key" "this" {
+  count       = var.cloudwatch_metric_alarms_enabled ? 1 : 0 # only created together with the alarms
+  description = "KMS key for notify-slack test"
+}
+
+resource "aws_kms_ciphertext" "slack_url" {
+  count     = var.cloudwatch_metric_alarms_enabled ? 1 : 0
+  plaintext = var.slack_webhook_url # webhook URL encrypted with the key above
+  key_id    = aws_kms_key.this[0].arn
+}
+
+resource "aws_sns_topic" "slack_topic" {
+  count = var.cloudwatch_metric_alarms_enabled ? 
1 : 0 + depends_on = [aws_elasticache_replication_group.redis] + name = format("%s-%s-%s", var.environment, var.name, "slack-topic") + delivery_policy = <