Skip to content

Commit

Permalink
Merge pull request #40 from rackspace-infrastructure-automation/191-c…
Browse files Browse the repository at this point in the history
…walarm-testing

Implement CloudWatch Alarm Module
  • Loading branch information
John Titus committed Jan 18, 2019
2 parents 3033cfa + 892f03d commit 23ccd75
Show file tree
Hide file tree
Showing 7 changed files with 103 additions and 127 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ This module creates one or more autorecovery instances.

```
module "ar" {
source = "git@github.com:rackspace-infrastructure-automation/aws-terraform-ec2_autorecovery//?ref=v0.0.2"
source = "git@github.com:rackspace-infrastructure-automation/aws-terraform-ec2_autorecovery//?ref=v0.0.10"
ec2_os = "amazon"
subnets = ["${module.vpc.private_subnets}"]
Expand All @@ -25,7 +25,6 @@ Full working references are available at [examples](examples)
| additional\_ssm\_bootstrap\_list | A list of maps consisting of main step actions, to be appended to SSM associations. Please see usage.tf.example in this repo for examples. | list | `<list>` | no |
| additional\_ssm\_bootstrap\_step\_count | Count of steps added for input 'additional_ssm_bootstrap_list'. This is required since 'additional_ssm_bootstrap_list' is a list of maps | string | `"0"` | no |
| additional\_tags | Additional tags to be added to the EC2 instance. Please see usage.tf.example in this repo for examples. | map | `<map>` | no |
| alarm\_notification\_topic | SNS Topic ARN to notify if there are any alarms | string | `""` | no |
| backup\_tag\_value | Value of the 'Backup' tag, used to assign the EBSSnapper configuration | string | `"False"` | no |
| cloudwatch\_log\_retention | The number of days to retain Cloudwatch Logs for this instance. | string | `"30"` | no |
| creation\_policy\_timeout | Time to wait for the number of signals for the creation policy. H/M/S Hours/Minutes/Seconds | string | `"20m"` | no |
Expand Down Expand Up @@ -56,6 +55,7 @@ Full working references are available at [examples](examples)
| instance\_role\_managed\_policy\_arns | List of IAM policy ARNs for the InstanceRole IAM role. IAM ARNs can be found within the Policies section of the AWS IAM console. e.g. ['arn:aws:iam::aws:policy/AmazonEC2FullAccess', 'arn:aws:iam::aws:policy/service-role/AmazonEC2RoleforSSM', 'arn:aws:iam::aws:policy/service-role/AmazonEC2SpotFleetRole'] | list | `<list>` | no |
| instance\_type | EC2 Instance Type e.g. 't2.micro' | string | `"t2.micro"` | no |
| key\_pair | Name of an existing EC2 KeyPair to enable SSH access to the instances. | string | `""` | no |
| notification\_topic | SNS Topic ARN to notify if there are any alarms | string | `""` | no |
| perform\_ssm\_inventory\_tag | Determines whether Instance is tracked via System Manager Inventory. | string | `"True"` | no |
| primary\_ebs\_volume\_iops | IOPS value required for use with io1 EBS volumes. This value should be 3 times the EBS volume size. | string | `"0"` | no |
| primary\_ebs\_volume\_size | EBS Volume Size in GB | string | `"60"` | no |
Expand Down
2 changes: 1 addition & 1 deletion examples/custom_cw_agent_config.tf
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ module "vpc" {
data "aws_region" "current_region" {}

module "ec2_ar_with_codedeploy" {
source = "git@github.com:rackspace-infrastructure-automation/aws-terraform-ec2_autorecovery?ref=v0.0.8"
source = "git@github.com:rackspace-infrastructure-automation/aws-terraform-ec2_autorecovery?ref=v0.0.10"
ec2_os = "rhel6"
instance_count = "1"
subnets = "${module.vpc.private_subnets}"
Expand Down
2 changes: 1 addition & 1 deletion examples/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ data "aws_ami" "amazon_centos_7" {
}

module "ec2_ar" {
source = "git@github.com:rackspace-infrastructure-automation/aws-terraform-ec2_autorecovery?ref=v0.0.8"
source = "git@github.com:rackspace-infrastructure-automation/aws-terraform-ec2_autorecovery?ref=v0.0.10"
ec2_os = "centos7"
instance_count = "3"
subnets = "${module.vpc.public_subnets}"
Expand Down
2 changes: 1 addition & 1 deletion examples/unmanaged.tf
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ module "sns" {
}

module "unmanaged_ar" {
source = "git@github.com:rackspace-infrastructure-automation/aws-terraform-ec2_autorecovery?ref=v0.0.8"
source = "git@github.com:rackspace-infrastructure-automation/aws-terraform-ec2_autorecovery?ref=v0.0.10"

ec2_os = "centos7"
instance_count = "1"
Expand Down
174 changes: 75 additions & 99 deletions main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
*
*```
*module "ar" {
* source = "git@github.com:rackspace-infrastructure-automation/aws-terraform-ec2_autorecovery//?ref=v0.0.2"
* source = "git@github.com:rackspace-infrastructure-automation/aws-terraform-ec2_autorecovery//?ref=v0.0.10"
*
* ec2_os = "amazon"
* subnets = ["${module.vpc.private_subnets}"]
Expand All @@ -20,14 +20,6 @@
* Full working references are available at [examples](examples)
*/

# Generates a 16-character random string drawn only from uppercase letters and
# digits (lowercase and special characters are disabled).
resource "random_string" "r_string" {
length = 16
upper = true
lower = false
number = true
special = false
}

locals {
user_data_map = {
amazon = "amazon_linux_userdata.sh"
Expand Down Expand Up @@ -115,24 +107,6 @@ EOF
disabled = ""
}

alarm_sns_notification = "${compact(list(var.alarm_notification_topic))}"

alarm_emergency_ticket = [
"arn:aws:sns:${data.aws_region.current_region.name}:${data.aws_caller_identity.current_account.account_id}:rackspace-support-emergency",
]

recovery_action = "${var.rackspace_managed ? "managed" : "unmanaged"}"

recovery_alarm_action = {
managed = "${local.alarm_emergency_ticket}"
unmanaged = "${local.alarm_sns_notification}"
}

recovery_ok_action = {
managed = "${local.alarm_emergency_ticket}"
unmanaged = []
}

ami_owner_mapping = {
amazon = "137112412989"
amazon2 = "137112412989"
Expand Down Expand Up @@ -377,106 +351,108 @@ resource "aws_cloudwatch_log_group" "application_logs" {
retention_in_days = "${var.cloudwatch_log_retention}"
}

resource "aws_cloudwatch_metric_alarm" "status_check_failed_system_alarm_ticket" {
count = "${var.instance_count}"
alarm_name = "${join("-", list("StatusCheckFailedSystemAlarmTicket", var.resource_name, format("%03d",count.index+1)))}"
alarm_description = "Status checks have failed for system, generating ticket."
namespace = "AWS/EC2"
statistic = "Minimum"
comparison_operator = "GreaterThanThreshold"
threshold = "0"
unit = "Count"
evaluation_periods = "2"
period = "60"
metric_name = "StatusCheckFailed_System"
ok_actions = ["${local.recovery_ok_action[local.recovery_action]}"]
alarm_actions = ["${local.recovery_alarm_action[local.recovery_action]}"]
data "null_data_source" "alarm_dimensions" {
count = "${var.instance_count}"

dimensions {
# coalescelist and list("novalue") were used here due to element not being able to handle empty lists, even if conditional will not allow portion to execute
inputs = {
InstanceId = "${element(coalescelist(aws_instance.mod_ec2_instance_with_secondary_ebs.*.id, aws_instance.mod_ec2_instance_no_secondary_ebs.*.id), count.index)}"
}
}

# Ticket-generating alarm for failed EC2 *system* status checks, delegated to
# the shared cloudwatch_alarm module (pinned to ref v0.0.1).
# Condition passed to the module: Minimum of AWS/EC2 StatusCheckFailed_System
# GreaterThanThreshold 0, over 2 evaluation periods with period "60"
# (presumably seconds, as for CloudWatch alarms in general - confirm in module docs).
module "status_check_failed_system_alarm_ticket" {
source = "git@github.com:rackspace-infrastructure-automation/aws-terraform-cloudwatch_alarm//?ref=v0.0.1"

# One alarm is requested per instance (alarm_count = instance_count).
alarm_count = "${var.instance_count}"
alarm_description = "Status checks have failed for system, generating ticket."
# Only the base name is passed here, with no per-instance index suffix.
# NOTE(review): presumably the module makes names unique per alarm - confirm.
alarm_name = "${join("-", list("StatusCheckFailedSystemAlarmTicket", var.resource_name))}"
comparison_operator = "GreaterThanThreshold"
# List of dimension maps, one per null_data_source instance (each carries an InstanceId).
dimensions = "${data.null_data_source.alarm_dimensions.*.outputs}"
evaluation_periods = "2"
# NOTE(review): var.notification_topic is wrapped in a list as-is; if it is an
# empty string this passes [""] - verify the module tolerates/filters that.
notification_topic = ["${var.notification_topic}"]
metric_name = "StatusCheckFailed_System"
# NOTE(review): the exact routing driven by rackspace_alarms_enabled,
# rackspace_managed and severity is defined inside the module, not here.
rackspace_alarms_enabled = true
rackspace_managed = "${var.rackspace_managed}"
namespace = "AWS/EC2"
period = "60"
severity = "emergency"
statistic = "Minimum"
threshold = "0"
unit = "Count"
}

resource "aws_cloudwatch_metric_alarm" "status_check_failed_instance_alarm_reboot" {
count = "${var.enable_recovery_alarms ? var.instance_count : 0}"
alarm_name = "${join("-", list("StatusCheckFailedInstanceAlarmReboot", var.resource_name, format("%03d",count.index+1)))}"
alarm_description = "Status checks have failed, rebooting system."
alarm_name = "${join("-", list("StatusCheckFailedInstanceAlarmReboot", var.resource_name, format("%03d",count.index+1)))}"
comparison_operator = "GreaterThanThreshold"
dimensions = "${data.null_data_source.alarm_dimensions.*.outputs[count.index]}"
evaluation_periods = "5"
metric_name = "StatusCheckFailed_Instance"
namespace = "AWS/EC2"
period = "60"
statistic = "Minimum"
comparison_operator = "GreaterThanThreshold"
threshold = "0"
unit = "Count"
evaluation_periods = "5"
period = "60"
metric_name = "StatusCheckFailed_Instance"
alarm_actions = ["arn:aws:swf:${data.aws_region.current_region.name}:${data.aws_caller_identity.current_account.account_id}:action/actions/AWS_EC2.InstanceId.Reboot/1.0"]

dimensions {
# coalescelist and list("novalue") were used here due to element not being able to handle empty lists, even if conditional will not allow portion to execute
InstanceId = "${element(coalescelist(aws_instance.mod_ec2_instance_with_secondary_ebs.*.id, aws_instance.mod_ec2_instance_no_secondary_ebs.*.id), count.index)}"
}
alarm_actions = ["arn:aws:swf:${data.aws_region.current_region.name}:${data.aws_caller_identity.current_account.account_id}:action/actions/AWS_EC2.InstanceId.Reboot/1.0"]
}

resource "aws_cloudwatch_metric_alarm" "status_check_failed_system_alarm_recover" {
count = "${var.enable_recovery_alarms ? var.instance_count : 0}"
alarm_name = "${join("-", list("StatusCheckFailedSystemAlarmRecover", var.resource_name, format("%03d",count.index+1)))}"
alarm_description = "Status checks have failed for system, recovering instance"
namespace = "AWS/EC2"
statistic = "Minimum"
alarm_name = "${join("-", list("StatusCheckFailedSystemAlarmRecover", var.resource_name, format("%03d",count.index+1)))}"
comparison_operator = "GreaterThanThreshold"
threshold = "0"
unit = "Count"
dimensions = "${data.null_data_source.alarm_dimensions.*.outputs[count.index]}"
evaluation_periods = "2"
period = "60"
metric_name = "StatusCheckFailed_System"
alarm_actions = ["arn:aws:automate:${data.aws_region.current_region.name}:ec2:recover"]

dimensions {
# coalescelist and list("novalue") were used here due to element not being able to handle empty lists, even if conditional will not allow portion to execute
InstanceId = "${element(coalescelist(aws_instance.mod_ec2_instance_with_secondary_ebs.*.id, aws_instance.mod_ec2_instance_no_secondary_ebs.*.id), count.index)}"
}
}

resource "aws_cloudwatch_metric_alarm" "status_check_failed_instance_alarm_ticket" {
count = "${var.instance_count}"
alarm_name = "${join("-", list("StatusCheckFailedInstanceAlarmTicket", var.resource_name, format("%03d",count.index+1)))}"
alarm_description = "Status checks have failed, generating ticket."
namespace = "AWS/EC2"
period = "60"
statistic = "Minimum"
comparison_operator = "GreaterThanThreshold"
threshold = "0"
unit = "Count"
evaluation_periods = "10"
period = "60"
metric_name = "StatusCheckFailed_Instance"
ok_actions = ["${local.recovery_ok_action[local.recovery_action]}"]
alarm_actions = ["${local.recovery_alarm_action[local.recovery_action]}"]

dimensions {
# coalescelist and list("novalue") were used here due to element not being able to handle empty lists, even if conditional will not allow portion to execute
InstanceId = "${element(coalescelist(aws_instance.mod_ec2_instance_with_secondary_ebs.*.id, aws_instance.mod_ec2_instance_no_secondary_ebs.*.id), count.index)}"
}
alarm_actions = ["arn:aws:automate:${data.aws_region.current_region.name}:ec2:recover"]
}

resource "aws_cloudwatch_metric_alarm" "cpu_alarm_high" {
count = "${var.instance_count}"
alarm_name = "${join("-", list("CPUAlarmHigh", var.resource_name, format("%03d",count.index+1)))}"
alarm_description = "CPU Alarm ${var.cw_cpu_high_operator} ${var.cw_cpu_high_threshold}% for ${var.cw_cpu_high_period} seconds ${var.cw_cpu_high_evaluations} times."
namespace = "AWS/EC2"
statistic = "Average"
comparison_operator = "${var.cw_cpu_high_operator}"
threshold = "${var.cw_cpu_high_threshold}"
evaluation_periods = "${var.cw_cpu_high_evaluations}"
period = "${var.cw_cpu_high_period}"
metric_name = "CPUUtilization"
ok_actions = []
alarm_actions = ["${compact(list(var.alarm_notification_topic))}"]

dimensions {
# coalescelist and list("novalue") were used here due to element not being able to handle empty lists, even if conditional will not allow portion to execute
InstanceId = "${element(coalescelist(aws_instance.mod_ec2_instance_with_secondary_ebs.*.id, aws_instance.mod_ec2_instance_no_secondary_ebs.*.id), count.index)}"
}
# Ticket-generating alarm for failed EC2 *instance* status checks, delegated to
# the shared cloudwatch_alarm module (pinned to ref v0.0.1).
# Condition passed to the module: Minimum of AWS/EC2 StatusCheckFailed_Instance
# GreaterThanThreshold 0, over 10 evaluation periods with period "60"
# (presumably seconds - confirm in module docs).
module "status_check_failed_instance_alarm_ticket" {
source = "git@github.com:rackspace-infrastructure-automation/aws-terraform-cloudwatch_alarm//?ref=v0.0.1"

# One alarm is requested per instance (alarm_count = instance_count).
alarm_count = "${var.instance_count}"
alarm_description = "Status checks have failed, generating ticket."
# Only the base name is passed here, with no per-instance index suffix.
# NOTE(review): presumably the module makes names unique per alarm - confirm.
alarm_name = "${join("-", list("StatusCheckFailedInstanceAlarmTicket", var.resource_name))}"
comparison_operator = "GreaterThanThreshold"
# List of dimension maps, one per null_data_source instance (each carries an InstanceId).
dimensions = "${data.null_data_source.alarm_dimensions.*.outputs}"
evaluation_periods = "10"
metric_name = "StatusCheckFailed_Instance"
# NOTE(review): var.notification_topic is wrapped in a list as-is; if it is an
# empty string this passes [""] - verify the module tolerates/filters that.
notification_topic = ["${var.notification_topic}"]
namespace = "AWS/EC2"
period = "60"
# NOTE(review): the exact routing driven by rackspace_alarms_enabled,
# rackspace_managed and severity is defined inside the module, not here.
rackspace_alarms_enabled = true
rackspace_managed = "${var.rackspace_managed}"
severity = "emergency"
statistic = "Minimum"
threshold = "0"
unit = "Count"
}

# High-CPU alarm on AWS/EC2 CPUUtilization (Average statistic), delegated to the
# shared cloudwatch_alarm module (pinned to ref v0.0.1).
# Operator, threshold, period and evaluation count all come from the
# cw_cpu_high_* variables, and the same values are echoed in the description.
module "cpu_alarm_high" {
source = "git@github.com:rackspace-infrastructure-automation/aws-terraform-cloudwatch_alarm//?ref=v0.0.1"

# One alarm is requested per instance (alarm_count = instance_count).
alarm_count = "${var.instance_count}"
alarm_description = "CPU Alarm ${var.cw_cpu_high_operator} ${var.cw_cpu_high_threshold}% for ${var.cw_cpu_high_period} seconds ${var.cw_cpu_high_evaluations} times."
# Only the base name is passed here, with no per-instance index suffix.
# NOTE(review): presumably the module makes names unique per alarm - confirm.
alarm_name = "${join("-", list("CPUAlarmHigh", var.resource_name))}"
comparison_operator = "${var.cw_cpu_high_operator}"
# NOTE(review): customer alarms are enabled and Rackspace alarms disabled for
# this alarm; presumably this notifies only notification_topic and never
# raises a Rackspace ticket - confirm against the module's documentation.
customer_alarms_enabled = true
# List of dimension maps, one per null_data_source instance (each carries an InstanceId).
dimensions = "${data.null_data_source.alarm_dimensions.*.outputs}"
evaluation_periods = "${var.cw_cpu_high_evaluations}"
metric_name = "CPUUtilization"
# NOTE(review): var.notification_topic is wrapped in a list as-is; if it is an
# empty string this passes [""] - verify the module tolerates/filters that.
notification_topic = ["${var.notification_topic}"]
namespace = "AWS/EC2"
period = "${var.cw_cpu_high_period}"
rackspace_alarms_enabled = false
rackspace_managed = "${var.rackspace_managed}"
statistic = "Average"
threshold = "${var.cw_cpu_high_threshold}"
}

#
Expand Down
44 changes: 22 additions & 22 deletions tests/test1/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ module "ec2_ar_centos7_with_codedeploy" {
cloudwatch_log_retention = "30"
ssm_association_refresh_rate = "rate(1 day)"
additional_ssm_bootstrap_step_count = "2"
alarm_notification_topic = ""
notification_topic = ""
disable_api_termination = "False"
t2_unlimited_mode = "standard"
creation_policy_timeout = "20m"
Expand Down Expand Up @@ -159,7 +159,7 @@ module "ec2_ar_centos7_no_codedeploy" {
cloudwatch_log_retention = "30"
ssm_association_refresh_rate = "rate(1 day)"
additional_ssm_bootstrap_step_count = "2"
alarm_notification_topic = ""
notification_topic = ""
disable_api_termination = "False"
t2_unlimited_mode = "standard"
creation_policy_timeout = "20m"
Expand Down Expand Up @@ -250,7 +250,7 @@ module "ec2_ar_windows_with_codedeploy" {
cloudwatch_log_retention = "30"
ssm_association_refresh_rate = "rate(1 day)"
additional_ssm_bootstrap_step_count = "2"
alarm_notification_topic = ""
notification_topic = ""
disable_api_termination = "False"
t2_unlimited_mode = "standard"
creation_policy_timeout = "20m"
Expand Down Expand Up @@ -337,7 +337,7 @@ module "ec2_ar_windows_no_codedeploy" {
cloudwatch_log_retention = "30"
ssm_association_refresh_rate = "rate(1 day)"
additional_ssm_bootstrap_step_count = "2"
alarm_notification_topic = ""
notification_topic = ""
disable_api_termination = "False"
t2_unlimited_mode = "standard"
creation_policy_timeout = "20m"
Expand Down Expand Up @@ -393,29 +393,29 @@ module "sns" {
module "unmanaged_ar" {
source = "../../module"

ec2_os = "centos7"
instance_count = "1"
subnets = ["${element(module.vpc.private_subnets, 0)}"]
security_group_list = ["${module.vpc.default_sg}"]
image_id = "${data.aws_ami.amazon_centos_7.image_id}"
instance_type = "t2.micro"
resource_name = "my_unmanaged_instance-${random_string.res_name.result}"
alarm_notification_topic = "${module.sns.topic_arn}"
rackspace_managed = false
ec2_os = "centos7"
instance_count = "1"
subnets = ["${element(module.vpc.private_subnets, 0)}"]
security_group_list = ["${module.vpc.default_sg}"]
image_id = "${data.aws_ami.amazon_centos_7.image_id}"
instance_type = "t2.micro"
resource_name = "my_unmanaged_instance-${random_string.res_name.result}"
notification_topic = "${module.sns.topic_arn}"
rackspace_managed = false
}

module "zero_count_ar" {
source = "../../module"

ec2_os = "centos7"
instance_count = "0"
subnets = []
security_group_list = ["${module.vpc.default_sg}"]
image_id = "${data.aws_ami.amazon_centos_7.image_id}"
instance_type = "t2.micro"
resource_name = "my_nonexistent_instance-${random_string.res_name.result}"
alarm_notification_topic = "${module.sns.topic_arn}"
rackspace_managed = false
ec2_os = "centos7"
instance_count = "0"
subnets = []
security_group_list = ["${module.vpc.default_sg}"]
image_id = "${data.aws_ami.amazon_centos_7.image_id}"
instance_type = "t2.micro"
resource_name = "my_nonexistent_instance-${random_string.res_name.result}"
notification_topic = "${module.sns.topic_arn}"
rackspace_managed = false
}

module "ec2_nfs" {
Expand Down
2 changes: 1 addition & 1 deletion variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,7 @@ variable "ssm_patching_group" {
# CloudWatch and Logs
#

variable "alarm_notification_topic" {
variable "notification_topic" {
description = "SNS Topic ARN to notify if there are any alarms"
type = "string"
default = ""
Expand Down

0 comments on commit 23ccd75

Please sign in to comment.