diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/config.test.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/config.test.ts index f4c8865879..33760ce47e 100644 --- a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/config.test.ts +++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/config.test.ts @@ -24,8 +24,10 @@ describe('Config', () => { process.env.KMS_KEY_ID = 'KMS_KEY_ID'; process.env.LAMBDA_TIMEOUT = '113'; process.env.LAUNCH_TEMPLATE_NAME_LINUX = 'LAUNCH_TEMPLATE_NAME_LINUX'; + process.env.LAUNCH_TEMPLATE_NAME_LINUX_NVIDIA = 'LAUNCH_TEMPLATE_NAME_LINUX_NVIDIA'; process.env.LAUNCH_TEMPLATE_NAME_WINDOWS = 'LAUNCH_TEMPLATE_NAME_WINDOWS'; process.env.LAUNCH_TEMPLATE_VERSION_LINUX = 'LAUNCH_TEMPLATE_VERSION_LINUX'; + process.env.LAUNCH_TEMPLATE_VERSION_LINUX_NVIDIA = 'LAUNCH_TEMPLATE_VERSION_LINUX_NVIDIA'; process.env.LAUNCH_TEMPLATE_VERSION_WINDOWS = 'LAUNCH_TEMPLATE_VERSION_WINDOWS'; process.env.MINIMUM_RUNNING_TIME_IN_MINUTES = '33'; process.env.MIN_AVAILABLE_RUNNERS = '113'; @@ -55,8 +57,10 @@ describe('Config', () => { expect(Config.Instance.kmsKeyId).toBe('KMS_KEY_ID'); expect(Config.Instance.lambdaTimeout).toBe(113); expect(Config.Instance.launchTemplateNameLinux).toBe('LAUNCH_TEMPLATE_NAME_LINUX'); + expect(Config.Instance.launchTemplateNameLinuxNvidia).toBe('LAUNCH_TEMPLATE_NAME_LINUX_NVIDIA'); expect(Config.Instance.launchTemplateNameWindows).toBe('LAUNCH_TEMPLATE_NAME_WINDOWS'); expect(Config.Instance.launchTemplateVersionLinux).toBe('LAUNCH_TEMPLATE_VERSION_LINUX'); + expect(Config.Instance.launchTemplateVersionLinuxNvidia).toBe('LAUNCH_TEMPLATE_VERSION_LINUX_NVIDIA'); expect(Config.Instance.launchTemplateVersionWindows).toBe('LAUNCH_TEMPLATE_VERSION_WINDOWS'); expect(Config.Instance.minAvailableRunners).toBe(113); expect(Config.Instance.minimumRunningTimeInMinutes).toBe(33); @@ -92,8 +96,10 @@ describe('Config', () => { delete process.env.KMS_KEY_ID; delete process.env.LAMBDA_TIMEOUT; process.env.LAUNCH_TEMPLATE_NAME_LINUX = 'LAUNCH_TEMPLATE_NAME_LINUX'; + process.env.LAUNCH_TEMPLATE_NAME_LINUX_NVIDIA = 'LAUNCH_TEMPLATE_NAME_LINUX_NVIDIA'; process.env.LAUNCH_TEMPLATE_NAME_WINDOWS = 'LAUNCH_TEMPLATE_NAME_WINDOWS'; process.env.LAUNCH_TEMPLATE_VERSION_LINUX = 'LAUNCH_TEMPLATE_VERSION_LINUX'; + process.env.LAUNCH_TEMPLATE_VERSION_LINUX_NVIDIA = 'LAUNCH_TEMPLATE_VERSION_LINUX_NVIDIA'; process.env.LAUNCH_TEMPLATE_VERSION_WINDOWS = 'LAUNCH_TEMPLATE_VERSION_WINDOWS'; delete process.env.MIN_AVAILABLE_RUNNERS; delete process.env.MUST_HAVE_ISSUES_LABELS; @@ -120,8 +126,10 @@ describe('Config', () => { expect(Config.Instance.kmsKeyId).toBeUndefined(); expect(Config.Instance.lambdaTimeout).toEqual(600); expect(Config.Instance.launchTemplateNameLinux).toBe('LAUNCH_TEMPLATE_NAME_LINUX'); + expect(Config.Instance.launchTemplateNameLinuxNvidia).toBe('LAUNCH_TEMPLATE_NAME_LINUX_NVIDIA'); expect(Config.Instance.launchTemplateNameWindows).toBe('LAUNCH_TEMPLATE_NAME_WINDOWS'); expect(Config.Instance.launchTemplateVersionLinux).toBe('LAUNCH_TEMPLATE_VERSION_LINUX'); + expect(Config.Instance.launchTemplateVersionLinuxNvidia).toBe('LAUNCH_TEMPLATE_VERSION_LINUX_NVIDIA'); expect(Config.Instance.launchTemplateVersionWindows).toBe('LAUNCH_TEMPLATE_VERSION_WINDOWS'); expect(Config.Instance.minAvailableRunners).toBe(10); expect(Config.Instance.minimumRunningTimeInMinutes).toBe(10); diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/config.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/config.ts index 3dc83df33e..8497609777 100644 --- a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/config.ts +++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/config.ts @@ -15,8 +15,10 @@ export class Config { readonly kmsKeyId: string | undefined; readonly lambdaTimeout: number; readonly launchTemplateNameLinux: string | undefined; + readonly launchTemplateNameLinuxNvidia: string | undefined; readonly launchTemplateNameWindows: string | undefined; readonly launchTemplateVersionLinux: string | undefined; + readonly launchTemplateVersionLinuxNvidia: string | undefined; readonly launchTemplateVersionWindows: string | undefined; readonly minAvailableRunners: number; readonly minimumRunningTimeInMinutes: number; @@ -46,8 +48,10 @@ export class Config { this.kmsKeyId = process.env.KMS_KEY_ID; this.lambdaTimeout = Number(process.env.LAMBDA_TIMEOUT || '600'); this.launchTemplateNameLinux = process.env.LAUNCH_TEMPLATE_NAME_LINUX; + this.launchTemplateNameLinuxNvidia = process.env.LAUNCH_TEMPLATE_NAME_LINUX_NVIDIA; this.launchTemplateNameWindows = process.env.LAUNCH_TEMPLATE_NAME_WINDOWS; this.launchTemplateVersionLinux = process.env.LAUNCH_TEMPLATE_VERSION_LINUX; + this.launchTemplateVersionLinuxNvidia = process.env.LAUNCH_TEMPLATE_VERSION_LINUX_NVIDIA; this.launchTemplateVersionWindows = process.env.LAUNCH_TEMPLATE_VERSION_WINDOWS; /* istanbul ignore next */ diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/runners.test.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/runners.test.ts index 63ba7aa6c1..b00b369f27 100644 --- a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/runners.test.ts +++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/runners.test.ts @@ -52,8 +52,12 @@ function createExpectedRunInstancesLinux(runnerParameters: RunnerInputParameters MaxCount: 1, MinCount: 1, LaunchTemplate: { - LaunchTemplateName: Config.Instance.launchTemplateNameLinux, - Version: Config.Instance.launchTemplateVersionLinux, + LaunchTemplateName: runnerParameters.runnerType.runnerTypeName.includes('.nvidia.gpu') + ? Config.Instance.launchTemplateNameLinuxNvidia + : Config.Instance.launchTemplateNameLinux, + Version: runnerParameters.runnerType.runnerTypeName.includes('.nvidia.gpu') + ? Config.Instance.launchTemplateVersionLinuxNvidia + : Config.Instance.launchTemplateVersionLinux, }, InstanceType: runnerParameters.runnerType.instance_type, BlockDeviceMappings: [ @@ -345,7 +349,7 @@ describe('create runner', () => { os: 'linux', max_available: 200, disk_size: 100, - runnerTypeName: 'linuxCpu', + runnerTypeName: 'linuxCpu.nvidia.gpu', is_ephemeral: true, }, }; diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/runners.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/runners.ts index ecd5c6acd8..9d6ad26864 100644 --- a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/runners.ts +++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/runners.ts @@ -221,6 +221,18 @@ async function addSSMParameterRunnerConfig( console.debug(`Created SSM Parameters(s): ${createdSSMParams.join(',')}`); } +function getLaunchTemplateName(runnerParameters: RunnerInputParameters): Array { + if (runnerParameters.runnerType.os === 'linux') { + if (runnerParameters.runnerType.runnerTypeName.includes('.nvidia.gpu')) { + return [Config.Instance.launchTemplateNameLinuxNvidia, Config.Instance.launchTemplateVersionLinuxNvidia]; + } else { + return [Config.Instance.launchTemplateNameLinux, Config.Instance.launchTemplateVersionLinux]; + } + } else { + return [Config.Instance.launchTemplateNameWindows, Config.Instance.launchTemplateVersionWindows]; + } +} + export async function createRunner(runnerParameters: RunnerInputParameters, metrics: Metrics): Promise { try { console.debug('Runner configuration: ' + JSON.stringify(runnerParameters)); @@ -256,6 +268,8 @@ export async function createRunner(runnerParameters: RunnerInputParameters, metr }); } + const [launchTemplateName, launchTemplateVersion] = getLaunchTemplateName(runnerParameters); + const runInstancesResponse = await expBackOff(() => { return metrics.trackRequest( metrics.ec2RunInstancesAWSCallSuccess, @@ -266,14 +280,8 @@ export async function createRunner(runnerParameters: RunnerInputParameters, metr MaxCount: 1, MinCount: 1, LaunchTemplate: { - LaunchTemplateName: - runnerParameters.runnerType.os === 'linux' - ? Config.Instance.launchTemplateNameLinux - : Config.Instance.launchTemplateNameWindows, - Version: - runnerParameters.runnerType.os === 'linux' - ? Config.Instance.launchTemplateVersionLinux - : Config.Instance.launchTemplateVersionWindows, + LaunchTemplateName: launchTemplateName, + Version: launchTemplateVersion, }, InstanceType: runnerParameters.runnerType.instance_type, BlockDeviceMappings: [ diff --git a/terraform-aws-github-runner/modules/runners/logging.tf b/terraform-aws-github-runner/modules/runners/logging.tf index 1f9f827675..00559ab146 100644 --- a/terraform-aws-github-runner/modules/runners/logging.tf +++ b/terraform-aws-github-runner/modules/runners/logging.tf @@ -77,6 +77,26 @@ resource "aws_ssm_parameter" "cloudwatch_agent_config_runner_linux" { tags = local.tags } +resource "aws_ssm_parameter" "cloudwatch_agent_config_runner_linux_nvidia" { + count = var.enable_cloudwatch_agent ? 1 : 0 + name = "${var.environment}-cloudwatch_agent_config_runner_linux_nvidia" + type = "String" + value = jsonencode( + jsondecode( + templatefile( + "${path.module}/templates/cloudwatch_config.json", + { + aws_region = var.aws_region + environment = var.environment + logfiles = jsonencode(local.logfiles_linux) + metrics_collected = templatefile("${path.module}/templates/cloudwatch_config_linux_nvidia.json", {}) + } + ) + ) + ) + tags = local.tags +} + resource "aws_cloudwatch_log_group" "gh_runners_linux" { count = length(local.loggroups_names_linux) name = local.loggroups_names_linux[count.index] @@ -95,6 +115,17 @@ resource "aws_iam_role_policy" "cloudwatch_linux" { ) } +resource "aws_iam_role_policy" "cloudwatch_linux_nvidia" { + count = var.enable_ssm_on_runners ? 1 : 0 + name = "CloudWatchLogginAndMetricsLinuxNvidia" + role = aws_iam_role.runner.name + policy = templatefile("${path.module}/policies/instance-cloudwatch-policy.json", + { + ssm_parameter_arn = aws_ssm_parameter.cloudwatch_agent_config_runner_linux_nvidia[0].arn + } + ) +} + resource "aws_ssm_parameter" "cloudwatch_agent_config_runner_windows" { count = var.enable_cloudwatch_agent ? 1 : 0 name = "${var.environment}-cloudwatch_agent_config_runner_windows" diff --git a/terraform-aws-github-runner/modules/runners/main.tf b/terraform-aws-github-runner/modules/runners/main.tf index 3da6257884..384eceba3a 100644 --- a/terraform-aws-github-runner/modules/runners/main.tf +++ b/terraform-aws-github-runner/modules/runners/main.tf @@ -102,6 +102,52 @@ resource "aws_launch_template" "linux_runner" { tags = local.tags } +resource "aws_launch_template" "linux_runner_nvidia" { + name = "${var.environment}-action-linux-runner-nvidia" + + iam_instance_profile { + name = aws_iam_instance_profile.runner.name + } + + instance_initiated_shutdown_behavior = "terminate" + + image_id = data.aws_ami.runner_ami_linux.id + instance_type = var.instance_type + key_name = var.key_name + + tag_specifications { + resource_type = "instance" + tags = merge( + local.tags, + { + "Name" = format("%s", local.name_runner) + }, + ) + } + + tag_specifications { + resource_type = "volume" + tags = merge( + local.tags, + { + "Name" = format("%s", local.name_runner) + }, + ) + } + + user_data = base64encode(templatefile(local.userdata_template, { + environment = var.environment + pre_install = var.userdata_pre_install + post_install = var.userdata_post_install + enable_cloudwatch_agent = var.enable_cloudwatch_agent + ssm_key_cloudwatch_agent_config = var.enable_cloudwatch_agent ? aws_ssm_parameter.cloudwatch_agent_config_runner_linux_nvidia[0].name : "" + ghes_url = var.ghes_url + install_config_runner = local.install_config_runner_linux + })) + + tags = local.tags +} + resource "aws_launch_template" "windows_runner" { name = "${var.environment}-action-windows-runner" diff --git a/terraform-aws-github-runner/modules/runners/scale-up.tf b/terraform-aws-github-runner/modules/runners/scale-up.tf index 295cbe993e..2ee3782f25 100644 --- a/terraform-aws-github-runner/modules/runners/scale-up.tf +++ b/terraform-aws-github-runner/modules/runners/scale-up.tf @@ -30,25 +30,27 @@ resource "aws_lambda_function" "scale_up" { environment { variables = { - AWS_REGION_INSTANCES = join(",", var.aws_region_instances) - CANT_HAVE_ISSUES_LABELS = join(",", var.cant_have_issues_labels) - ENABLE_ORGANIZATION_RUNNERS = var.enable_organization_runners - ENVIRONMENT = var.environment - GITHUB_APP_CLIENT_ID = var.github_app.client_id - GITHUB_APP_CLIENT_SECRET = local.github_app_client_secret - GITHUB_APP_ID = var.github_app.id - GITHUB_APP_KEY_BASE64 = local.github_app_key_base64 - KMS_KEY_ID = var.encryption.kms_key_id - LAMBDA_TIMEOUT = var.lambda_timeout_scale_up - LAUNCH_TEMPLATE_NAME_LINUX = aws_launch_template.linux_runner.name - LAUNCH_TEMPLATE_NAME_WINDOWS = aws_launch_template.windows_runner.name - LAUNCH_TEMPLATE_VERSION_LINUX = aws_launch_template.linux_runner.latest_version - LAUNCH_TEMPLATE_VERSION_WINDOWS = aws_launch_template.windows_runner.latest_version - MUST_HAVE_ISSUES_LABELS = join(",", var.must_have_issues_labels) - RUNNER_EXTRA_LABELS = var.runner_extra_labels - SECRETSMANAGER_SECRETS_ID = var.secretsmanager_secrets_id - SECURITY_GROUP_IDS = join(",", concat([aws_security_group.runner_sg.id], var.runner_additional_security_group_ids)) - SUBNET_IDS = join(",", var.subnet_ids) + AWS_REGION_INSTANCES = join(",", var.aws_region_instances) + CANT_HAVE_ISSUES_LABELS = join(",", var.cant_have_issues_labels) + ENABLE_ORGANIZATION_RUNNERS = var.enable_organization_runners + ENVIRONMENT = var.environment + GITHUB_APP_CLIENT_ID = var.github_app.client_id + GITHUB_APP_CLIENT_SECRET = local.github_app_client_secret + GITHUB_APP_ID = var.github_app.id + GITHUB_APP_KEY_BASE64 = local.github_app_key_base64 + KMS_KEY_ID = var.encryption.kms_key_id + LAMBDA_TIMEOUT = var.lambda_timeout_scale_up + LAUNCH_TEMPLATE_NAME_LINUX = aws_launch_template.linux_runner.name + LAUNCH_TEMPLATE_NAME_LINUX_NVIDIA = aws_launch_template.linux_runner_nvidia.name + LAUNCH_TEMPLATE_NAME_WINDOWS = aws_launch_template.windows_runner.name + LAUNCH_TEMPLATE_VERSION_LINUX = aws_launch_template.linux_runner.latest_version + LAUNCH_TEMPLATE_VERSION_LINUX_NVIDIA = aws_launch_template.linux_runner_nvidia.latest_version + LAUNCH_TEMPLATE_VERSION_WINDOWS = aws_launch_template.windows_runner.latest_version + MUST_HAVE_ISSUES_LABELS = join(",", var.must_have_issues_labels) + RUNNER_EXTRA_LABELS = var.runner_extra_labels + SECRETSMANAGER_SECRETS_ID = var.secretsmanager_secrets_id + SECURITY_GROUP_IDS = join(",", concat([aws_security_group.runner_sg.id], var.runner_additional_security_group_ids)) + SUBNET_IDS = join(",", var.subnet_ids) } } @@ -62,7 +64,6 @@ resource "aws_lambda_function" "scale_up" { } resource "aws_lambda_alias" "scale_up_lambda_alias" { - count = var.scale_up_provisioned_concurrent_executions > 0 ? 1 : 0 name = "provisioned-${aws_lambda_function.scale_up.function_name}" description = "Alias for provisioned instances of ${aws_lambda_function.scale_up.function_name}" function_name = aws_lambda_function.scale_up.function_name @@ -73,7 +74,7 @@ resource "aws_lambda_provisioned_concurrency_config" "scale_up_provisioned_concu count = var.scale_up_provisioned_concurrent_executions > 0 ? 1 : 0 function_name = aws_lambda_alias.scale_up_lambda_alias.function_name provisioned_concurrent_executions = var.scale_up_provisioned_concurrent_executions - qualifier = aws_lambda_alias.scale_up_lambda_alias.version + qualifier = aws_lambda_alias.scale_up_lambda_alias.name } resource "aws_cloudwatch_log_group" "scale_up" { diff --git a/terraform-aws-github-runner/modules/runners/templates/cloudwatch_config_linux_nvidia.json b/terraform-aws-github-runner/modules/runners/templates/cloudwatch_config_linux_nvidia.json new file mode 100644 index 0000000000..b1a0bf0dec --- /dev/null +++ b/terraform-aws-github-runner/modules/runners/templates/cloudwatch_config_linux_nvidia.json @@ -0,0 +1,58 @@ +{ + "cpu": { + "measurement": [ + "cpu_usage_idle", + "cpu_usage_iowait", + "cpu_usage_user", + "cpu_usage_system" + ], + "metrics_collection_interval": 10 + }, + "disk": { + "measurement": [ + "free", + "total", + "used", + "used_percent", + "inodes_free", + "inodes_total" + ], + "metrics_collection_interval": 10, + "resources": [ + "*" + ] + }, + "diskio": { + "measurement": [ + "io_time" + ], + "metrics_collection_interval": 10, + "resources": [ + "/" + ] + }, + "mem": { + "measurement": [ + "total", + "used", + "free", + "used_percent" + ], + "metrics_collection_interval": 10 + }, + "swap": { + "measurement": [ + "swap_used_percent" + ], + "metrics_collection_interval": 10 + }, + "nvidia_gpu": { + "measurement": [ + "utilization_gpu", + "utilization_memory", + "memory_total", + "memory_used", + "memory_free" + ] + } +}