In [None]:
# Setup Spark and Glue configurations
#
# Session settings applied by the magics below:
#   - Glue version 3.0 with 2 workers
#   - spark.sql.catalog.spark_catalog and spark.sql.extensions set to the Delta Lake classes
#   - "--datalake-formats" set to "delta" through %%configure
# NOTE(review): %%configure is a cell magic and appears here after other magics;
# cell magics normally have to be the first line of their own cell - confirm whether
# the original notebook had this as a separate cell.

%glue_version 3.0
%spark_conf spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog
%spark_conf spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension
%number_of_workers 2

%%configure
{
  "--datalake-formats": "delta"
}


In [None]:
# Setup Python and Spark libraries
import sys
from awsglue.transforms import *
from pyspark.sql.functions import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

from delta.tables import *
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, array, ArrayType, DateType, TimestampType, FloatType

# Reuse the running SparkContext (or create one), wrap it in a GlueContext,
# then derive the Glue Job handle and the SparkSession from that context.
sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
job = Job(glueContext)
spark = glueContext.spark_session

In [None]:
# Send arguments to the job
#
# The "--NAME", value pairs are appended to sys.argv so that the next cell can
# resolve them by name. Re-running this cell appends the pairs again.

sys.argv.extend([
    "--S3_BUCKET", "aws-analytics-course",
    "--BRONZE_LAYER_NAMESPACE", "bronze/dms",
    "--SCRATCH_LAYER_NAMESPACE", "temp/delta",
    "--STORE_SALES_FOLDER", "sales",
    "--INCREMENTAL_DATA_FOLDER", "2023/01/12/19",
])

In [None]:
# Read job arguments
#
# Resolve the named parameters from sys.argv into `args`, then build the base
# S3 location under which the Delta tables are stored.

args = getResolvedOptions(
    sys.argv,
    [
        "S3_BUCKET",
        "BRONZE_LAYER_NAMESPACE",
        "SCRATCH_LAYER_NAMESPACE",
        "STORE_SALES_FOLDER",
        "INCREMENTAL_DATA_FOLDER",
    ],
)

# NOTE: the value ends with "/" and later cells append "/" + "store_orders",
# so the resulting path strings contain "//".
DELTA_TABLE_PATH = "s3://{}/{}/".format(args['S3_BUCKET'], args['SCRATCH_LAYER_NAMESPACE'])
print(DELTA_TABLE_PATH)

In [None]:
# Option 1 - Read file into a Dataframe by declaring a schema for the sales_order files
#
# The schema is declared as (column name, Spark type) pairs and turned into a
# StructType; `schema` is reused by the full-load branch of the merge cell.

SALES_ORDERS_PATH = "/".join([
    "s3://" + args['S3_BUCKET'],
    args['BRONZE_LAYER_NAMESPACE'],
    args['STORE_SALES_FOLDER'],
    "store_orders",
])

SALES_ORDERS_SCHEMA = [
    ('Op', StringType()),
    ('order_number', IntegerType()),
    ('customer_id', IntegerType()),
    ('product_id', IntegerType()),
    ('order_date', StringType()),
    ('units', IntegerType()),
    ('sale_price', FloatType()),
    ('currency', StringType()),
    ('order_mode', StringType()),
]

fields = [StructField(column_name, column_type) for column_name, column_type in SALES_ORDERS_SCHEMA]
schema = StructType(fields)

# Read every CSV in the incremental folder, treating the first line as a header.
df_read_data_incremental = (
    spark.read
    .option("header", "true")
    .csv("/".join([SALES_ORDERS_PATH, args['INCREMENTAL_DATA_FOLDER'], "*.csv"]), schema=schema)
)

df_read_data_incremental.show(3)
df_read_data_incremental.printSchema()

In [None]:
# Option 2 - Read file into a Dataframe by having Spark infer the schema for the sales_order files
#
# Same input files as Option 1, but the column types come from inferSchema
# instead of a declared StructType. This overwrites df_read_data_incremental.

SALES_ORDERS_PATH = "/".join([
    "s3://" + args['S3_BUCKET'],
    args['BRONZE_LAYER_NAMESPACE'],
    args['STORE_SALES_FOLDER'],
    "store_orders",
])

df_read_data_incremental = (
    spark.read
    .options(header="true", inferSchema="true")
    .csv("/".join([SALES_ORDERS_PATH, args['INCREMENTAL_DATA_FOLDER'], "*.csv"]))
)

df_read_data_incremental.show(3)
df_read_data_incremental.printSchema()


In [None]:
# Change the column types - convert order_date from string to date, updated_at from string to timestamp
#
# order_date is parsed with the pattern MM/dd/yyyy and updated_at with
# yyyy-MM-dd HH:mm:ss; both columns are replaced in place on the DataFrame.

df_read_data_incremental = (
    df_read_data_incremental
    .withColumn("order_date", to_date("order_date", 'MM/dd/yyyy'))
    .withColumn("updated_at", to_timestamp("updated_at", 'yyyy-MM-dd HH:mm:ss'))
)
df_read_data_incremental.show(3)
df_read_data_incremental.printSchema()

In [None]:
# Use Delta framework to either create a delta table or merge data into an existing table
#
# - If a Delta table already exists at <DELTA_TABLE_PATH>/store_orders, the CSV files in the
#   INCREMENTAL_DATA_FOLDER are read, converted, and merged into it keyed on order_number:
#   matching rows are updated, new rows are inserted.
# - Otherwise the full-load file LOAD00000001.csv is read with the explicit `schema` built in
#   the "Option 1" cell, stamped with the current timestamp, and written as a new Delta table.
#
# The existence check is an explicit DeltaTable.isDeltaTable call rather than a bare
# try/except around the whole cell, so a failure while reading or merging the incremental
# data surfaces as its own error instead of being reported as "Delta table does not exist"
# and followed by an attempt to write the full load.
#
# NOTE(review): the Op column is copied like any other column; no rows are deleted based on
# its value - confirm whether deletes in the incremental files should be applied.

STORE_ORDERS_DELTA_PATH = DELTA_TABLE_PATH + "/" + "store_orders"

if DeltaTable.isDeltaTable(spark, STORE_ORDERS_DELTA_PATH):
    print("Delta table exists")
    deltaTable = DeltaTable.forPath(spark, STORE_ORDERS_DELTA_PATH)

    # Read the incremental files and apply the same type conversions as the earlier cell.
    df_read_data_incremental = (
        spark.read
        .option("header", "true")
        .option("inferSchema", "true")
        .csv(SALES_ORDERS_PATH + "/" + args['INCREMENTAL_DATA_FOLDER'] + "/" + "*.csv")
    )
    df_read_data_incremental = df_read_data_incremental.withColumn("order_date", to_date(df_read_data_incremental.order_date, 'MM/dd/yyyy'))
    df_read_data_incremental = df_read_data_incremental.withColumn("updated_at", to_timestamp(df_read_data_incremental.updated_at, 'yyyy-MM-dd HH:mm:ss'))
    df_read_data_incremental.show(10)

    # The update and the insert use the same target-column -> source-expression mapping,
    # so it is built once from the column list.
    merge_columns = [
        "Op",
        "order_number",
        "customer_id",
        "product_id",
        "order_date",
        "units",
        "sale_price",
        "currency",
        "order_mode",
        "updated_at",
    ]
    column_mapping = {name: "store_orders_incremental." + name for name in merge_columns}

    (
        deltaTable.alias("store_orders")
        .merge(
            df_read_data_incremental.alias("store_orders_incremental"),
            "store_orders.order_number = store_orders_incremental.order_number",
        )
        .whenMatchedUpdate(set=column_mapping)
        .whenNotMatchedInsert(values=column_mapping)
        .execute()
    )
else:
    print("Delta table does not exist")
    # The explicit schema is passed, so inferSchema is not requested here.
    df_read_data_full = (
        spark.read
        .option("header", "true")
        .csv(SALES_ORDERS_PATH + "/" + "LOAD00000001.csv", schema=schema)
    )

    df_read_data_full = df_read_data_full.withColumn("order_date", to_date(df_read_data_full.order_date, 'MM/dd/yyyy'))
    # current_timestamp() already returns a Column, so it needs no lit() wrapper.
    df_read_data_full = df_read_data_full.withColumn("updated_at", current_timestamp())
    df_read_data_full.write.format("delta").save(STORE_ORDERS_DELTA_PATH)
    df_read_data_full.show(5)

In [None]:
# Load the Delta table, register it as the temp view "store_orders" for the
# SQL cells, and print its row count.
dfread = (
    spark.read
    .format("delta")
    .load(DELTA_TABLE_PATH + "/" + "store_orders")
)
dfread.createOrReplaceTempView("store_orders")
print(dfread.count())

In [None]:
%%sql
-- Preview five rows from the store_orders temp view
SELECT * FROM store_orders LIMIT 5;

In [None]:
%%sql
-- Look up order 2345 in the store_orders temp view
SELECT * FROM store_orders WHERE order_number = 2345;

In [None]:
# Show history of the table
#
# Bind a DeltaTable handle to the store_orders path and display the first
# five rows of its history.

deltaTable = DeltaTable.forPath(spark, "/".join([DELTA_TABLE_PATH, "store_orders"]))
table_history = deltaTable.history()
table_history.show(5)

In [None]:
# Go back and re-run the cell titled "Use Delta framework to either create a delta table or merge data into an existing table"

In [None]:
# Reload the Delta table, refresh the "store_orders" temp view, print the row
# count, and display the first five rows of the table history.
dfread = (
    spark.read
    .format("delta")
    .load(DELTA_TABLE_PATH + "/" + "store_orders")
)
dfread.createOrReplaceTempView("store_orders")
print(dfread.count())

deltaTable = DeltaTable.forPath(spark, "/".join([DELTA_TABLE_PATH, "store_orders"]))
table_history = deltaTable.history()
table_history.show(5)

In [None]:
%%sql
-- Look up order 2601 in the store_orders temp view
SELECT * FROM store_orders WHERE order_number = 2601;

In [None]:
# In the cell titled "Send arguments to the job", change the "--INCREMENTAL_DATA_FOLDER" value from "2023/01/12/18" to "2023/01/12/19"
# Re-run the cells titled "Send arguments to the job" and "Read job arguments"
# Then go back and re-run the cell titled "Use Delta framework to either create a delta table or merge data into an existing table"

In [None]:
# Time travel: load version 0 of the Delta table and register that snapshot as
# the temp view "store_orders_1". The view is created from dfreadprev (the
# versioned read), not from dfread, which holds the current table contents.
dfreadprev = spark.read.format("delta").option("versionAsOf", "0").load(DELTA_TABLE_PATH + "/" + "store_orders")
dfreadprev.createOrReplaceTempView("store_orders_1")

In [None]:
%%sql
-- Look up order 2345 in the store_orders_1 temp view
SELECT * FROM store_orders_1 WHERE order_number = 2345;

In [None]:

# Generate a "symlink_format_manifest" for the Delta table bound to deltaTable.
# The external table DDL defined later in this notebook uses the table's
# _symlink_format_manifest folder as its LOCATION.
deltaTable.generate("symlink_format_manifest")

In [None]:
# Create the "delta" database if it is not already present; the external table
# is created inside it after a later "USE delta".
spark.sql("CREATE DATABASE IF NOT EXISTS delta")

In [17]:
# DDL for an external table named store_orders whose LOCATION is the
# _symlink_format_manifest folder under <DELTA_TABLE_PATH>/store_orders.
# It uses the Parquet Hive SerDe with SymlinkTextInputFormat as the input format.
# NOTE(review): the column list here starts with dms_mode, declares order_date as
# string, and has no updated_at, whereas the merge cell writes columns named
# Op ... updated_at with order_date converted to a date - confirm which schema is intended.
SALES_ORDERS_SQL = "CREATE EXTERNAL TABLE IF NOT EXISTS store_orders (dms_mode string,     \
                                                                      order_number int,    \
                                                                      customer_id int,     \
                                                                      product_id int,      \
                                                                      order_date string,   \
                                                                      units int,           \
                                                                      sale_price float,    \
                                                                      currency string,     \
                                                                      order_mode string)   \
                                                                      ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'  \
                                                                      STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat' \
                                                                      OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'   \
                                                                      LOCATION '" + DELTA_TABLE_PATH + "/" + "store_orders" + "/" + "_symlink_format_manifest'"
#print(SALES_ORDERS_SQL)





In [18]:
# Switch to the "delta" database, drop any existing store_orders table, and
# recreate it from SALES_ORDERS_SQL. The statements must run in this order;
# the last call is left as the cell's final expression so its result is displayed.
spark.sql("USE delta")
spark.sql("DROP TABLE IF EXISTS store_orders")
spark.sql(SALES_ORDERS_SQL)

DataFrame[]


In [20]:
# Read the Delta table back and display its first 10 rows.
# show() prints the rows and returns None, so the DataFrame is assigned to
# dfread first and show() is called separately; otherwise dfread would be None.
dfread = spark.read.format("delta").load(DELTA_TABLE_PATH + "/" + "store_orders")
dfread.show(10)

+--------+------------+-----------+----------+----------+-----+----------+--------+----------+
|dms_mode|order_number|customer_id|product_id|order_date|units|sale_price|currency|order_mode|
+--------+------------+-----------+----------+----------+-----+----------+--------+----------+
|       I|           1|        212|         5|02/03/2019|   10|      11.6|     USD|       NEW|
|       I|           2|       1940|        10|06/24/2020|    8|     72.31|     USD|       NEW|
|       I|           3|         60|         6|02/11/2019|    4|     24.82|     INR|       NEW|
|       I|           4|       2776|         6|05/20/2018|    4|     20.91|     USD|       NEW|
|       I|           5|        409|         9|07/05/2019|    5|     98.41|     INR|       NEW|
|       I|           6|        978|         6|12/16/2020|    1|       6.9|     USD|       NEW|
|       I|           7|       2904|         6|01/04/2021|    1|     71.56|     EUR|       NEW|
|       I|           8|       1269|         3|08/1