From 892f03d6b48b4da73d8be62053f6183f17ff5a7b Mon Sep 17 00:00:00 2001 From: johnctitus Date: Mon, 14 Jan 2019 12:55:20 -0600 Subject: [PATCH] Implement CloudWatch Alarm Module --- README.md | 4 +- examples/custom_cw_agent_config.tf | 2 +- examples/main.tf | 2 +- examples/unmanaged.tf | 2 +- main.tf | 174 +++++++++++++---------------- tests/test1/main.tf | 44 ++++---- variables.tf | 2 +- 7 files changed, 103 insertions(+), 127 deletions(-) diff --git a/README.md b/README.md index 695e848..2fd6b52 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ This module creates one or more autorecovery instances. ``` module "ar" { - source = "git@github.com:rackspace-infrastructure-automation/aws-terraform-ec2_autorecovery//?ref=v0.0.2" + source = "git@github.com:rackspace-infrastructure-automation/aws-terraform-ec2_autorecovery//?ref=v0.0.10" ec2_os = "amazon" subnets = ["${module.vpc.private_subnets}"] @@ -25,7 +25,6 @@ Full working references are available at [examples](examples) | additional\_ssm\_bootstrap\_list | A list of maps consisting of main step actions, to be appended to SSM associations. Please see usage.tf.example in this repo for examples. | list | `` | no | | additional\_ssm\_bootstrap\_step\_count | Count of steps added for input 'additional_ssm_bootstrap_list'. This is required since 'additional_ssm_bootstrap_list' is a list of maps | string | `"0"` | no | | additional\_tags | Additional tags to be added to the EC2 instance Please see usage.tf.example in this repo for examples. | map | `` | no | -| alarm\_notification\_topic | SNS Topic ARN to notify if there are any alarms | string | `""` | no | | backup\_tag\_value | Value of the 'Backup' tag, used to assign te EBSSnapper configuration | string | `"False"` | no | | cloudwatch\_log\_retention | The number of days to retain Cloudwatch Logs for this instance. | string | `"30"` | no | | creation\_policy\_timeout | Time to wait for the number of signals for the creation policy. H/M/S Hours/Minutes/Seconds | string | `"20m"` | no | @@ -56,6 +55,7 @@ Full working references are available at [examples](examples) | instance\_role\_managed\_policy\_arns | List of IAM policy ARNs for the InstanceRole IAM role. IAM ARNs can be found within the Policies section of the AWS IAM console. e.g. ['arn:aws:iam::aws:policy/AmazonEC2FullAccess', 'arn:aws:iam::aws:policy/service-role/AmazonEC2RoleforSSM', 'arn:aws:iam::aws:policy/service-role/AmazonEC2SpotFleetRole'] | list | `` | no | | instance\_type | EC2 Instance Type e.g. 't2.micro' | string | `"t2.micro"` | no | | key\_pair | Name of an existing EC2 KeyPair to enable SSH access to the instances. | string | `""` | no | +| notification\_topic | SNS Topic ARN to notify if there are any alarms | string | `""` | no | | perform\_ssm\_inventory\_tag | Determines whether Instance is tracked via System Manager Inventory. | string | `"True"` | no | | primary\_ebs\_volume\_iops | Iops value required for use with io1 EBS volumes. This value should be 3 times the EBS volume size | string | `"0"` | no | | primary\_ebs\_volume\_size | EBS Volume Size in GB | string | `"60"` | no | diff --git a/examples/custom_cw_agent_config.tf b/examples/custom_cw_agent_config.tf index 0a77dce..d0255ca 100644 --- a/examples/custom_cw_agent_config.tf +++ b/examples/custom_cw_agent_config.tf @@ -18,7 +18,7 @@ module "vpc" { data "aws_region" "current_region" {} module "ec2_ar_with_codedeploy" { - source = "git@github.com:rackspace-infrastructure-automation/aws-terraform-ec2_autorecovery?ref=v0.0.8" + source = "git@github.com:rackspace-infrastructure-automation/aws-terraform-ec2_autorecovery?ref=v0.0.10" ec2_os = "rhel6" instance_count = "1" subnets = "${module.vpc.private_subnets}" diff --git a/examples/main.tf b/examples/main.tf index 94dd972..1b6c375 100644 --- a/examples/main.tf +++ b/examples/main.tf @@ -28,7 +28,7 @@ data "aws_ami" "amazon_centos_7" { } module "ec2_ar" { - source = "git@github.com:rackspace-infrastructure-automation/aws-terraform-ec2_autorecovery?ref=v0.0.8" + source = "git@github.com:rackspace-infrastructure-automation/aws-terraform-ec2_autorecovery?ref=v0.0.10" ec2_os = "centos7" instance_count = "3" subnets = "${module.vpc.public_subnets}" diff --git a/examples/unmanaged.tf b/examples/unmanaged.tf index 28ce467..29b8706 100644 --- a/examples/unmanaged.tf +++ b/examples/unmanaged.tf @@ -33,7 +33,7 @@ module "sns" { } module "unmanaged_ar" { - source = "git@github.com:rackspace-infrastructure-automation/aws-terraform-ec2_autorecovery?ref=v0.0.8" + source = "git@github.com:rackspace-infrastructure-automation/aws-terraform-ec2_autorecovery?ref=v0.0.10" ec2_os = "centos7" instance_count = "1" diff --git a/main.tf b/main.tf index 8699398..9cfc278 100644 --- a/main.tf +++ b/main.tf @@ -7,7 +7,7 @@ * *``` *module "ar" { - * source = "git@github.com:rackspace-infrastructure-automation/aws-terraform-ec2_autorecovery//?ref=v0.0.2" + * source = "git@github.com:rackspace-infrastructure-automation/aws-terraform-ec2_autorecovery//?ref=v0.0.10" * * ec2_os = "amazon" * subnets = ["${module.vpc.private_subnets}"] @@ -20,14 +20,6 @@ * Full working references are available at [examples](examples) */ -resource "random_string" "r_string" { - length = 16 - upper = true - lower = false - number = true - special = false -} - locals { user_data_map = { amazon = "amazon_linux_userdata.sh" @@ -115,24 +107,6 @@ EOF disabled = "" } - alarm_sns_notification = "${compact(list(var.alarm_notification_topic))}" - - alarm_emergency_ticket = [ - "arn:aws:sns:${data.aws_region.current_region.name}:${data.aws_caller_identity.current_account.account_id}:rackspace-support-emergency", - ] - - recovery_action = "${var.rackspace_managed ? "managed" : "unmanaged"}" - - recovery_alarm_action = { - managed = "${local.alarm_emergency_ticket}" - unmanaged = "${local.alarm_sns_notification}" - } - - recovery_ok_action = { - managed = "${local.alarm_emergency_ticket}" - unmanaged = [] - } - ami_owner_mapping = { amazon = "137112412989" amazon2 = "137112412989" @@ -377,106 +351,108 @@ resource "aws_cloudwatch_log_group" "application_logs" { retention_in_days = "${var.cloudwatch_log_retention}" } -resource "aws_cloudwatch_metric_alarm" "status_check_failed_system_alarm_ticket" { - count = "${var.instance_count}" - alarm_name = "${join("-", list("StatusCheckFailedSystemAlarmTicket", var.resource_name, format("%03d",count.index+1)))}" - alarm_description = "Status checks have failed for system, generating ticket." - namespace = "AWS/EC2" - statistic = "Minimum" - comparison_operator = "GreaterThanThreshold" - threshold = "0" - unit = "Count" - evaluation_periods = "2" - period = "60" - metric_name = "StatusCheckFailed_System" - ok_actions = ["${local.recovery_ok_action[local.recovery_action]}"] - alarm_actions = ["${local.recovery_alarm_action[local.recovery_action]}"] +data "null_data_source" "alarm_dimensions" { + count = "${var.instance_count}" - dimensions { - # coalescelist and list("novalue") were used here due to element not being able to handle empty lists, even if conditional will not allow portion to execute + inputs = { InstanceId = "${element(coalescelist(aws_instance.mod_ec2_instance_with_secondary_ebs.*.id, aws_instance.mod_ec2_instance_no_secondary_ebs.*.id), count.index)}" } } +module "status_check_failed_system_alarm_ticket" { + source = "git@github.com:rackspace-infrastructure-automation/aws-terraform-cloudwatch_alarm//?ref=v0.0.1" + + alarm_count = "${var.instance_count}" + alarm_description = "Status checks have failed for system, generating ticket." + alarm_name = "${join("-", list("StatusCheckFailedSystemAlarmTicket", var.resource_name))}" + comparison_operator = "GreaterThanThreshold" + dimensions = "${data.null_data_source.alarm_dimensions.*.outputs}" + evaluation_periods = "2" + notification_topic = ["${var.notification_topic}"] + metric_name = "StatusCheckFailed_System" + rackspace_alarms_enabled = true + rackspace_managed = "${var.rackspace_managed}" + namespace = "AWS/EC2" + period = "60" + severity = "emergency" + statistic = "Minimum" + threshold = "0" + unit = "Count" +} + resource "aws_cloudwatch_metric_alarm" "status_check_failed_instance_alarm_reboot" { count = "${var.enable_recovery_alarms ? var.instance_count : 0}" - alarm_name = "${join("-", list("StatusCheckFailedInstanceAlarmReboot", var.resource_name, format("%03d",count.index+1)))}" alarm_description = "Status checks have failed, rebooting system." + alarm_name = "${join("-", list("StatusCheckFailedInstanceAlarmReboot", var.resource_name, format("%03d",count.index+1)))}" + comparison_operator = "GreaterThanThreshold" + dimensions = "${data.null_data_source.alarm_dimensions.*.outputs[count.index]}" + evaluation_periods = "5" + metric_name = "StatusCheckFailed_Instance" namespace = "AWS/EC2" + period = "60" statistic = "Minimum" - comparison_operator = "GreaterThanThreshold" threshold = "0" unit = "Count" - evaluation_periods = "5" - period = "60" - metric_name = "StatusCheckFailed_Instance" - alarm_actions = ["arn:aws:swf:${data.aws_region.current_region.name}:${data.aws_caller_identity.current_account.account_id}:action/actions/AWS_EC2.InstanceId.Reboot/1.0"] - dimensions { - # coalescelist and list("novalue") were used here due to element not being able to handle empty lists, even if conditional will not allow portion to execute - InstanceId = "${element(coalescelist(aws_instance.mod_ec2_instance_with_secondary_ebs.*.id, aws_instance.mod_ec2_instance_no_secondary_ebs.*.id), count.index)}" - } + alarm_actions = ["arn:aws:swf:${data.aws_region.current_region.name}:${data.aws_caller_identity.current_account.account_id}:action/actions/AWS_EC2.InstanceId.Reboot/1.0"] } resource "aws_cloudwatch_metric_alarm" "status_check_failed_system_alarm_recover" { count = "${var.enable_recovery_alarms ? var.instance_count : 0}" - alarm_name = "${join("-", list("StatusCheckFailedSystemAlarmRecover", var.resource_name, format("%03d",count.index+1)))}" alarm_description = "Status checks have failed for system, recovering instance" - namespace = "AWS/EC2" - statistic = "Minimum" + alarm_name = "${join("-", list("StatusCheckFailedSystemAlarmRecover", var.resource_name, format("%03d",count.index+1)))}" comparison_operator = "GreaterThanThreshold" - threshold = "0" - unit = "Count" + dimensions = "${data.null_data_source.alarm_dimensions.*.outputs[count.index]}" evaluation_periods = "2" - period = "60" metric_name = "StatusCheckFailed_System" - alarm_actions = ["arn:aws:automate:${data.aws_region.current_region.name}:ec2:recover"] - - dimensions { - # coalescelist and list("novalue") were used here due to element not being able to handle empty lists, even if conditional will not allow portion to execute - InstanceId = "${element(coalescelist(aws_instance.mod_ec2_instance_with_secondary_ebs.*.id, aws_instance.mod_ec2_instance_no_secondary_ebs.*.id), count.index)}" - } -} - -resource "aws_cloudwatch_metric_alarm" "status_check_failed_instance_alarm_ticket" { - count = "${var.instance_count}" - alarm_name = "${join("-", list("StatusCheckFailedInstanceAlarmTicket", var.resource_name, format("%03d",count.index+1)))}" - alarm_description = "Status checks have failed, generating ticket." namespace = "AWS/EC2" + period = "60" statistic = "Minimum" - comparison_operator = "GreaterThanThreshold" threshold = "0" unit = "Count" - evaluation_periods = "10" - period = "60" - metric_name = "StatusCheckFailed_Instance" - ok_actions = ["${local.recovery_ok_action[local.recovery_action]}"] - alarm_actions = ["${local.recovery_alarm_action[local.recovery_action]}"] - dimensions { - # coalescelist and list("novalue") were used here due to element not being able to handle empty lists, even if conditional will not allow portion to execute - InstanceId = "${element(coalescelist(aws_instance.mod_ec2_instance_with_secondary_ebs.*.id, aws_instance.mod_ec2_instance_no_secondary_ebs.*.id), count.index)}" - } + alarm_actions = ["arn:aws:automate:${data.aws_region.current_region.name}:ec2:recover"] } -resource "aws_cloudwatch_metric_alarm" "cpu_alarm_high" { - count = "${var.instance_count}" - alarm_name = "${join("-", list("CPUAlarmHigh", var.resource_name, format("%03d",count.index+1)))}" - alarm_description = "CPU Alarm ${var.cw_cpu_high_operator} ${var.cw_cpu_high_threshold}% for ${var.cw_cpu_high_period} seconds ${var.cw_cpu_high_evaluations} times." - namespace = "AWS/EC2" - statistic = "Average" - comparison_operator = "${var.cw_cpu_high_operator}" - threshold = "${var.cw_cpu_high_threshold}" - evaluation_periods = "${var.cw_cpu_high_evaluations}" - period = "${var.cw_cpu_high_period}" - metric_name = "CPUUtilization" - ok_actions = [] - alarm_actions = ["${compact(list(var.alarm_notification_topic))}"] - - dimensions { - # coalescelist and list("novalue") were used here due to element not being able to handle empty lists, even if conditional will not allow portion to execute - InstanceId = "${element(coalescelist(aws_instance.mod_ec2_instance_with_secondary_ebs.*.id, aws_instance.mod_ec2_instance_no_secondary_ebs.*.id), count.index)}" - } +module "status_check_failed_instance_alarm_ticket" { + source = "git@github.com:rackspace-infrastructure-automation/aws-terraform-cloudwatch_alarm//?ref=v0.0.1" + + alarm_count = "${var.instance_count}" + alarm_description = "Status checks have failed, generating ticket." + alarm_name = "${join("-", list("StatusCheckFailedInstanceAlarmTicket", var.resource_name))}" + comparison_operator = "GreaterThanThreshold" + dimensions = "${data.null_data_source.alarm_dimensions.*.outputs}" + evaluation_periods = "10" + metric_name = "StatusCheckFailed_Instance" + notification_topic = ["${var.notification_topic}"] + namespace = "AWS/EC2" + period = "60" + rackspace_alarms_enabled = true + rackspace_managed = "${var.rackspace_managed}" + severity = "emergency" + statistic = "Minimum" + threshold = "0" + unit = "Count" +} + +module "cpu_alarm_high" { + source = "git@github.com:rackspace-infrastructure-automation/aws-terraform-cloudwatch_alarm//?ref=v0.0.1" + + alarm_count = "${var.instance_count}" + alarm_description = "CPU Alarm ${var.cw_cpu_high_operator} ${var.cw_cpu_high_threshold}% for ${var.cw_cpu_high_period} seconds ${var.cw_cpu_high_evaluations} times." + alarm_name = "${join("-", list("CPUAlarmHigh", var.resource_name))}" + comparison_operator = "${var.cw_cpu_high_operator}" + customer_alarms_enabled = true + dimensions = "${data.null_data_source.alarm_dimensions.*.outputs}" + evaluation_periods = "${var.cw_cpu_high_evaluations}" + metric_name = "CPUUtilization" + notification_topic = ["${var.notification_topic}"] + namespace = "AWS/EC2" + period = "${var.cw_cpu_high_period}" + rackspace_alarms_enabled = false + rackspace_managed = "${var.rackspace_managed}" + statistic = "Average" + threshold = "${var.cw_cpu_high_threshold}" } # diff --git a/tests/test1/main.tf b/tests/test1/main.tf index be6d8ae..0b49fd7 100644 --- a/tests/test1/main.tf +++ b/tests/test1/main.tf @@ -62,7 +62,7 @@ module "ec2_ar_centos7_with_codedeploy" { cloudwatch_log_retention = "30" ssm_association_refresh_rate = "rate(1 day)" additional_ssm_bootstrap_step_count = "2" - alarm_notification_topic = "" + notification_topic = "" disable_api_termination = "False" t2_unlimited_mode = "standard" creation_policy_timeout = "20m" @@ -159,7 +159,7 @@ module "ec2_ar_centos7_no_codedeploy" { cloudwatch_log_retention = "30" ssm_association_refresh_rate = "rate(1 day)" additional_ssm_bootstrap_step_count = "2" - alarm_notification_topic = "" + notification_topic = "" disable_api_termination = "False" t2_unlimited_mode = "standard" creation_policy_timeout = "20m" @@ -250,7 +250,7 @@ module "ec2_ar_windows_with_codedeploy" { cloudwatch_log_retention = "30" ssm_association_refresh_rate = "rate(1 day)" additional_ssm_bootstrap_step_count = "2" - alarm_notification_topic = "" + notification_topic = "" disable_api_termination = "False" t2_unlimited_mode = "standard" creation_policy_timeout = "20m" @@ -337,7 +337,7 @@ module "ec2_ar_windows_no_codedeploy" { cloudwatch_log_retention = "30" ssm_association_refresh_rate = "rate(1 day)" additional_ssm_bootstrap_step_count = "2" - alarm_notification_topic = "" + notification_topic = "" disable_api_termination = "False" t2_unlimited_mode = "standard" creation_policy_timeout = "20m" @@ -393,29 +393,29 @@ module "sns" { module "unmanaged_ar" { source = "../../module" - ec2_os = "centos7" - instance_count = "1" - subnets = ["${element(module.vpc.private_subnets, 0)}"] - security_group_list = ["${module.vpc.default_sg}"] - image_id = "${data.aws_ami.amazon_centos_7.image_id}" - instance_type = "t2.micro" - resource_name = "my_unmanaged_instance-${random_string.res_name.result}" - alarm_notification_topic = "${module.sns.topic_arn}" - rackspace_managed = false + ec2_os = "centos7" + instance_count = "1" + subnets = ["${element(module.vpc.private_subnets, 0)}"] + security_group_list = ["${module.vpc.default_sg}"] + image_id = "${data.aws_ami.amazon_centos_7.image_id}" + instance_type = "t2.micro" + resource_name = "my_unmanaged_instance-${random_string.res_name.result}" + notification_topic = "${module.sns.topic_arn}" + rackspace_managed = false } module "zero_count_ar" { source = "../../module" - ec2_os = "centos7" - instance_count = "0" - subnets = [] - security_group_list = ["${module.vpc.default_sg}"] - image_id = "${data.aws_ami.amazon_centos_7.image_id}" - instance_type = "t2.micro" - resource_name = "my_nonexistent_instance-${random_string.res_name.result}" - alarm_notification_topic = "${module.sns.topic_arn}" - rackspace_managed = false + ec2_os = "centos7" + instance_count = "0" + subnets = [] + security_group_list = ["${module.vpc.default_sg}"] + image_id = "${data.aws_ami.amazon_centos_7.image_id}" + instance_type = "t2.micro" + resource_name = "my_nonexistent_instance-${random_string.res_name.result}" + notification_topic = "${module.sns.topic_arn}" + rackspace_managed = false } module "ec2_nfs" { diff --git a/variables.tf b/variables.tf index b46f79d..f86b849 100644 --- a/variables.tf +++ b/variables.tf @@ -235,7 +235,7 @@ variable "ssm_patching_group" { # CloudWatch and Logs # -variable "alarm_notification_topic" { +variable "notification_topic" { description = "SNS Topic ARN to notify if there are any alarms" type = "string" default = ""