In [None]:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue import DynamicFrame



In [None]:
# ================================
#      Glue Job Initialization
# ================================

# Resolve the job arguments from the command line; only JOB_NAME is required here.
args = getResolvedOptions(sys.argv, ['JOB_NAME'])

# Create (or reuse) the Spark context, then build the Glue context on top of it.
# getOrCreate() is used instead of SparkContext() because constructing a second
# SparkContext while one is active raises an error, e.g. when this cell is re-run.
sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
spark = glueContext.spark_session

# Initialize the Glue job with its name and resolved arguments.
# The matching job.commit() call is made after the last export.
job = Job(glueContext)
job.init(args['JOB_NAME'], args)



In [None]:
# ===================================
#     Load DynamicFrames from Glue Catalog
# ===================================

# Each DynamicFrame below is read from one table in the AWS Glue Data Catalog.
# These are the cleaned AdventureWorks tables that get ingested into Redshift.

# Glue Data Catalog database that holds all of the source tables.
CATALOG_DATABASE = "aw_db"


def load_catalog_table(table_name, database=CATALOG_DATABASE):
    """Read one Glue Data Catalog table into a DynamicFrame.

    Args:
        table_name: Name of the catalog table to read.
        database: Catalog database containing the table; defaults to
            CATALOG_DATABASE.

    Returns:
        Whatever ``glueContext.create_dynamic_frame.from_catalog`` returns
        for the table.
    """
    return glueContext.create_dynamic_frame.from_catalog(
        database=database,
        table_name=table_name,
        # Same context naming as before: "<table_name>_df".
        transformation_ctx=f"{table_name}_df",
    )


aw_territories_df = load_catalog_table("aw_territories")
aw_returns_df = load_catalog_table("aw_returns")
aw_sales_df = load_catalog_table("aw_sales")
aw_products_df = load_catalog_table("aw_products")
aw_product_category_df = load_catalog_table("aw_product_category")
aw_customers_df = load_catalog_table("aw_customers")
aw_product_subcategories_df = load_catalog_table("aw_product_subcategories")



In [None]:
# ==========================================
#     Write Each Table to Amazon Redshift
# ==========================================

# This section writes each frame to Redshift, creating the target table first if it is missing.
# We use `preactions` to run the CREATE TABLE IF NOT EXISTS DDL before loading data (DDL-first approach).
# The write logic lives in the helper function below so the per-table exports stay DRY.

# Helper function to simplify export
def export_to_redshift(
    df,
    table_name,
    ddl,
    ctx_name,
    schema="public",
    connection_name="MyRedshiftConnection",
    tmp_dir="s3://aws-glue-assets-<account-id>-ap-south-1/temporary/",
):
    """Write a frame to a Redshift table via ``glueContext.write_dynamic_frame``.

    Args:
        df: Frame to write; passed through as ``frame``.
        table_name: Target table name without the schema prefix.
        ddl: SQL passed as the ``preactions`` option (here: the CREATE TABLE
            statement). When empty or None, no ``preactions`` option is sent.
            Note that the DDL text must name the same schema as ``schema``.
        ctx_name: Value passed as ``transformation_ctx`` for this write.
        schema: Schema prefix used to build ``dbtable``; defaults to "public".
        connection_name: Name passed as ``connectionName``.
        tmp_dir: S3 location passed as ``redshiftTmpDir``. The default still
            contains the literal ``<account-id>`` placeholder and must be
            replaced with a real bucket path before the job can run.

    Returns:
        None.
    """
    connection_options = {
        "redshiftTmpDir": tmp_dir,
        "useConnectionProperties": "true",
        "dbtable": f"{schema}.{table_name}",
        "connectionName": connection_name,
    }
    # Only send preactions when there is SQL to run before the load.
    if ddl:
        connection_options["preactions"] = ddl

    glueContext.write_dynamic_frame.from_options(
        frame=df,
        connection_type="redshift",
        connection_options=connection_options,
        transformation_ctx=ctx_name,
    )



In [None]:
# Export each table
# Every entry is (frame, Redshift table name, CREATE TABLE DDL). The entries are
# processed in the order listed, and the DDL is handed to export_to_redshift as-is.
redshift_exports = [
    (aw_territories_df, "aw_territories", """CREATE TABLE IF NOT EXISTS public.aw_territories (
        salesterritorykey INTEGER,
        region VARCHAR,
        country VARCHAR,
        continent VARCHAR
    );"""),
    (aw_returns_df, "aw_returns", """CREATE TABLE IF NOT EXISTS public.aw_returns (
        returndate DATE,
        territorykey INTEGER,
        productkey INTEGER,
        returnquantity INTEGER
    );"""),
    (aw_sales_df, "aw_sales", """CREATE TABLE IF NOT EXISTS public.aw_sales (
        orderdate DATE,
        stockdate DATE,
        ordernumber VARCHAR,
        productkey INTEGER,
        customerkey INTEGER,
        territorykey INTEGER,
        sales_year INTEGER
    );"""),
    (aw_products_df, "aw_products", """CREATE TABLE IF NOT EXISTS public.aw_products (
        productkey INTEGER,
        productsubcategorykey INTEGER,
        productsku VARCHAR,
        productname VARCHAR,
        modelname VARCHAR,
        productdescription VARCHAR,
        productcolor VARCHAR,
        productsize VARCHAR,
        productstyle VARCHAR,
        productcost DOUBLE PRECISION,
        productprice DOUBLE PRECISION
    );"""),
    (aw_product_category_df, "aw_product_category", """CREATE TABLE IF NOT EXISTS public.aw_product_category (
        productcategorykey INTEGER,
        categoryname VARCHAR
    );"""),
    (aw_customers_df, "aw_customers", """CREATE TABLE IF NOT EXISTS public.aw_customers (
        customerkey INTEGER,
        birthdate DATE,
        maritalstatus VARCHAR,
        gender VARCHAR,
        emailaddress VARCHAR,
        annualincome INTEGER,
        totalchildren INTEGER,
        educationlevel VARCHAR,
        occupation VARCHAR,
        homeowner VARCHAR,
        customername VARCHAR
    );"""),
    (aw_product_subcategories_df, "aw_product_subcategories", """CREATE TABLE IF NOT EXISTS public.aw_product_subcategories (
        productsubcategorykey INTEGER,
        subcategoryname VARCHAR,
        productcategorykey INTEGER
    );"""),
]

# The transformation context for each write is "Export_<table name>".
for frame, target_table, create_table_ddl in redshift_exports:
    export_to_redshift(frame, target_table, create_table_ddl, f"Export_{target_table}")

# Commit the job once every export call has returned
job.commit()
