
Commit

hotfix: glue, secrets
AJ Steers committed Oct 9, 2020
1 parent c290261 commit d3822f1
Showing 8 changed files with 72 additions and 49 deletions.
1 change: 0 additions & 1 deletion catalog/aws/airflow/main.tf
@@ -25,7 +25,6 @@ module "airflow_ecs_task" {
container_num_cores = var.container_num_cores
container_ram_gb = var.container_ram_gb
admin_ports = ["8080"]
app_ports = ["8080"]
always_on = true
use_load_balancer = true
}
2 changes: 1 addition & 1 deletion components/aws/ecs-task/main.tf
@@ -88,7 +88,7 @@ resource "aws_ecs_task_definition" "ecs_task" {
}
},
"portMappings": [
${join(",\n", [for p in flatten([coalesce(var.app_ports, []), coalesce(var.admin_ports, [])]) : <<EOF2
${join(",\n", [for p in distinct(flatten([coalesce(var.app_ports, []), coalesce(var.admin_ports, [])])) : <<EOF2
{
"containerPort": ${p},
"hostPort": ${p},
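
Note on the portMappings change above (not part of the commit): when the same port appears in both app_ports and admin_ports (as with Airflow's 8080), flatten() alone would render two identical portMappings entries; wrapping it in distinct() collapses the duplicate. A minimal sketch with hypothetical values:

    locals {
      app_ports   = ["8080"]
      admin_ports = ["8080"]

      # flatten() concatenates the two lists; distinct() drops the duplicate,
      # so only one portMappings entry is rendered per port.
      all_ports = distinct(flatten([local.app_ports, local.admin_ports]))
      # => ["8080"]
    }
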
43 changes: 41 additions & 2 deletions components/aws/glue-job/iam.tf
@@ -34,26 +34,65 @@ resource "aws_iam_policy" "glue_job_policy" {
"Version": "2012-10-17",
"Statement": [
{
"Sid": "ListObjectsInBucket",
"Sid": "ListS3ObjectsInBucket",
"Effect": "Allow",
"Action": [
"s3:ListBucket"
],
"Resource": [
"*",
"arn:aws:s3:::${var.s3_script_bucket_name}",
"arn:aws:s3:::${var.s3_source_bucket_name}",
"arn:aws:s3:::${var.s3_destination_bucket_name}"
]
},
{
"Sid": "AllObjectActions",
"Sid": "AllS3ObjectActions",
"Effect": "Allow",
"Action": "s3:*Object",
"Resource": [
"*",
"arn:aws:s3:::${var.s3_script_bucket_name}/*",
"arn:aws:s3:::${var.s3_source_bucket_name}/*",
"arn:aws:s3:::${var.s3_destination_bucket_name}/*"
]
},
{
"Effect": "Allow",
"Action": [
"logs:CreateLogGroup",
"logs:CreateLogStream",
"logs:PutLogEvents",
"logs:AssociateKmsKey"
],
"Resource": [
"arn:aws:logs:*:*:/aws-glue/*"
]
},
{
"Effect": "Allow",
"Action": [
"glue:*",
"s3:GetBucketLocation",
"s3:ListBucket",
"s3:ListAllMyBuckets",
"s3:GetBucketAcl",
"ec2:DescribeVpcEndpoints",
"ec2:DescribeRouteTables",
"ec2:CreateNetworkInterface",
"ec2:DeleteNetworkInterface",
"ec2:DescribeNetworkInterfaces",
"ec2:DescribeSecurityGroups",
"ec2:DescribeSubnets",
"ec2:DescribeVpcAttribute",
"iam:ListRolePolicies",
"iam:GetRole",
"iam:GetRolePolicy",
"cloudwatch:PutMetricData"
],
"Resource": [
"*"
]
}
]
}
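
For context, the policy above only takes effect once it is attached to the job role created in this component. The attachment is not shown in this diff; a plausible sketch (the resource label is an assumption):

    resource "aws_iam_role_policy_attachment" "glue_job_policy_attachment" {
      # Attach the Glue job policy to the Glue job role defined in this component.
      role       = aws_iam_role.glue_job_role.name
      policy_arn = aws_iam_policy.glue_job_policy.arn
    }
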
18 changes: 16 additions & 2 deletions components/aws/glue-job/main.tf
@@ -6,11 +6,12 @@ resource "aws_glue_job" "glue_job" {
name = "${var.name_prefix}data-transformation"
role_arn = aws_iam_role.glue_job_role.arn
tags = var.resource_tags
glue_version = "1.0"
glue_version = "2.0"
max_capacity = var.with_spark ? null : 1

worker_type = var.with_spark ? "Standard" : null
number_of_workers = var.with_spark ? var.num_workers : null
max_retries = 0

command {
script_location = (
@@ -22,5 +23,18 @@ resource "aws_glue_job" "glue_job" {
python_version = 3
}

default_arguments = var.default_arguments
default_arguments = merge(
var.default_arguments,
{
"--continuous-log-logGroup" = aws_cloudwatch_log_group.glue_job_log.name
"--enable-continuous-cloudwatch-log" = "true"
"--enable-continuous-log-filter" = "true"
"--enable-metrics" = ""
}
)
}

resource "aws_cloudwatch_log_group" "glue_job_log" {
name = "${var.name_prefix}data-transformation"
retention_in_days = 30
}
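
One detail of the merge() above worth noting: later arguments to merge() take precedence, so the continuous-logging keys always override any same-named keys supplied in var.default_arguments. A standalone sketch of that behaviour, with made-up values:

    locals {
      user_args = {
        "--enable-continuous-cloudwatch-log" = "false"
        "--TempDir"                          = "s3://example-bucket/tmp/"
      }

      effective_args = merge(local.user_args, {
        "--enable-continuous-cloudwatch-log" = "true"
      })
      # effective_args["--enable-continuous-cloudwatch-log"] == "true"  (the later map wins)
      # effective_args["--TempDir"]                          == "s3://example-bucket/tmp/"
    }
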
2 changes: 1 addition & 1 deletion components/aws/glue-job/outputs.tf
@@ -10,7 +10,7 @@ output "summary" {
Glue ETL Job Summary:
Job Name: ${aws_glue_job.glue_job.id}
Run Command: aws glue start-job-run --job-name ${aws_glue_job.glue_job.id} --arguments="--S3_DATA_BUCKET=${var.s3_destination_bucket_name}"
Run Command: aws glue start-job-run --job-name ${aws_glue_job.glue_job.id} --arguments="--S3_DATA_BUCKET=${var.s3_destination_bucket_name}" --region ${var.environment.aws_region}
EOF
}
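
If a root configuration wants to surface this summary after apply, it can pass the module output straight through (a sketch; the module label and source path are assumptions):

    output "glue_job_summary" {
      # Assumes a module block such as:  module "glue_job" { source = "../../components/aws/glue-job" ... }
      value = module.glue_job.summary
    }
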
4 changes: 2 additions & 2 deletions components/aws/glue-job/variables.tf
@@ -87,5 +87,5 @@ For additional information, see:
- https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/glue_job#default_arguments
EOF
type = map(string)
default = null
}
default = {}
}
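
The default moving from null to {} matters here, likely because main.tf now feeds this variable straight into merge(), which fails on a null argument while an empty map merges cleanly. A hedged sketch of a caller overriding it (the source path, bucket name, and elided required variables are illustrative only); the "--S3_DATA_BUCKET" key mirrors the getResolvedOptions call in the sample script further down:

    module "glue_job" {
      source = "../../components/aws/glue-job"   # path assumed

      # ...other required variables omitted for brevity...

      default_arguments = {
        # Read by getResolvedOptions(sys.argv, ["S3_DATA_BUCKET"]) in the sample script.
        "--S3_DATA_BUCKET" = "example-destination-bucket"
      }
    }
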
4 changes: 2 additions & 2 deletions components/aws/secrets-manager/main.tf
@@ -68,8 +68,8 @@ resource "aws_secretsmanager_secret_version" "secrets_value" {

resource "aws_ssm_parameter" "secrets" {
for_each = var.use_parameter_store == false ? {} : local.new_secrets_map
name = "${var.name_prefix}${random_id.suffix.dec}/${each.key}"
description = "Stored using Terraform"
name = "/${var.name_prefix}${random_id.suffix.dec}/${each.key}"
description = "Created using Terraform"
type = "SecureString"
value = each.value
tags = var.resource_tags
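
The added leading slash makes the parameter name a fully qualified path, which SSM expects once a name contains the / hierarchy separator. A hedged sketch of reading one of these parameters back elsewhere (the prefix and suffix values are stand-ins for the real name_prefix and random suffix):

    data "aws_ssm_parameter" "db_password" {
      name = "/myproject-dev-12345/db_password"
    }

    output "db_password" {
      value     = data.aws_ssm_parameter.db_password.value
      sensitive = true
    }
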
47 changes: 9 additions & 38 deletions samples/kitchen-sink-on-aws/glue/transform/transform.py
@@ -1,14 +1,9 @@
# PySpark Script for AWS Glue ETL
"""Sample Glue Tranform"""

# setup environment
import sys
from awsglue.context import GlueContext
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.job import Job
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from pyspark.sql import SQLContext

args = getResolvedOptions(sys.argv, ["S3_DATA_BUCKET"])
@@ -17,39 +12,15 @@

sc = SparkContext()
sqlContext = SQLContext(sc)
glueContext = GlueContext(SparkContext.getOrCreate())
spark = sqlContext.sparkSession

# create dynamic frame from raw data stored on s3
inputPop = glueContext.create_dynamic_frame.from_options(
connection_type="s3",
connection_options={
"path": "s3a://covid19-lake/static-datasets/csv/CountyPopulation/County_Population.csv"
},
format="csv",
format_options={},
transformation_ctx="",
)

# create dataframe for each table
inputPop_df = inputPop.toDF()

# create table to query from each dataframe
inputPop_df.createOrReplaceTempView("inputPop_table")

# write SQL statements to clean the data
inputPop_clean = spark.sql(
df = spark.read.csv("s3a://noaa-ghcn-pds/csv/2020.csv", inferSchema=True)
df.createOrReplaceTempView("temp_table")
df_clean = spark.sql(
"""
SELECT
id2 as fips,
County as county_name,
State as state_name,
Population Estimate 2018 as est_population
FROM inputPop_table
WHERE upper(state) = 'COLORADO'
SELECT *
FROM temp_table
"""
)

# write out the files to curated folder
inputPop_clean.write.mode("overwrite").parquet(
"s3://{}/CURATED/County_Population_CURATED.csv".format(bucket)
)
df_clean.write.mode("overwrite").csv("s3://{}/out/test_outfile.csv".format(bucket))
