In [0]:
from delta.tables import *
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, array, ArrayType, DateType, TimestampType, FloatType
from pyspark.sql.functions import *
from pyspark.sql.utils import AnalysisException

In [0]:
# NOTE(security): these are placeholders. Do not commit real credentials in a
# notebook; load them from a secret store (e.g. a Databricks secret scope) instead.
aws_access_key = "XXXXXXXXXXXXXXXXXXXXX"
aws_secret_key = "XXXXXXXXXXXXXXXXXXXXX"

# URL-encoded variants are only meaningful when a key is embedded in a URI
# (e.g. s3a://<key>:<secret>@bucket). They are kept for any cell that still
# refers to these names, but they must NOT be used for the Hadoop properties.
encoded_aws_access_key = aws_access_key.replace("/", "%2F")
encoded_secret_key = aws_secret_key.replace("/", "%2F")

# The S3A connector expects the raw key values; a percent-encoded secret that
# contained "/" would no longer match the real credential.
hadoop_conf = sc._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3a.access.key", aws_access_key)
hadoop_conf.set("fs.s3a.secret.key", aws_secret_key)

# If you are using Auto Loader file notification mode to load files, provide the AWS Region ID.
aws_region = "us-east-1"
hadoop_conf.set("fs.s3a.endpoint", "s3." + aws_region + ".amazonaws.com")

In [0]:
# Bucket and folder names used to build every S3 path in this notebook.
S3_BUCKET = "aws-analytics-course"
BRONZE_LAYER_NAMESPACE = "bronze/dms"
SCRATCH_LAYER_NAMESPACE = "temp/delta"
STORE_SALES_FOLDER = "sales"

# Sub-folder of the store_orders prefix whose CSV files the read cells load.
# It is defined here so those cells do not raise NameError on a fresh kernel;
# the merge cells further down reassign it before each call.
INCREMENTAL_DATA_FOLDER = "2023/01/12/18"

# Root location for Delta tables. The value ends with "/".
DELTA_TABLE_PATH = "s3://" + S3_BUCKET + "/" + SCRATCH_LAYER_NAMESPACE + "/"
print(DELTA_TABLE_PATH)

In [0]:

# Location of the store_orders files inside the bronze layer.
SALES_ORDERS_PATH = "s3://" + "/".join(
    [S3_BUCKET, BRONZE_LAYER_NAMESPACE, STORE_SALES_FOLDER, "store_orders"]
)

# (column name, Spark type) pairs for the explicit read schema.
SALES_ORDERS_SCHEMA = [
    ('Op', StringType()),
    ('order_number', IntegerType()),
    ('customer_id', IntegerType()),
    ('product_id', IntegerType()),
    ('order_date', StringType()),
    ('units', IntegerType()),
    ('sale_price', FloatType()),
    ('currency', StringType()),
    ('order_mode', StringType()),
]

fields = [
    StructField(column_name, column_type)
    for column_name, column_type in SALES_ORDERS_SCHEMA
]
schema = StructType(fields)

# Read every CSV of the selected folder using the explicit schema above.
df_read_data_incremental = (
    spark.read
    .option("header", "true")
    .csv("/".join([SALES_ORDERS_PATH, INCREMENTAL_DATA_FOLDER, "*.csv"]), schema=schema)
)

display(df_read_data_incremental)
df_read_data_incremental.printSchema()

In [0]:
SALES_ORDERS_PATH = f"s3://{S3_BUCKET}/{BRONZE_LAYER_NAMESPACE}/{STORE_SALES_FOLDER}/store_orders"

# Same folder as before, but this time Spark infers the column types.
df_read_data_incremental = spark.read.csv(
    f"{SALES_ORDERS_PATH}/{INCREMENTAL_DATA_FOLDER}/*.csv",
    header="true",
    inferSchema="true",
)

display(df_read_data_incremental)
df_read_data_incremental.printSchema()

In [0]:
%sql
-- Drop the store_orders table if it is already registered, so it can be created again below.
DROP TABLE IF EXISTS store_orders

In [0]:
# Parse order_date (MM/dd/yyyy) into a date and updated_at into a timestamp.
df_read_data_incremental = (
    df_read_data_incremental
    .withColumn("order_date", to_date(col("order_date"), 'MM/dd/yyyy'))
    .withColumn("updated_at", to_timestamp(col("updated_at"), 'yyyy-MM-dd HH:mm:ss'))
)
display(df_read_data_incremental)
df_read_data_incremental.printSchema()

In [0]:
# Columns copied from the incremental frame into the Delta table by the merge.
_STORE_ORDERS_COLUMNS = (
    "Op",
    "order_number",
    "customer_id",
    "product_id",
    "order_date",
    "units",
    "sale_price",
    "currency",
    "order_mode",
    "updated_at",
)


def _read_incremental_orders(sales_orders_path, incremental_data_folder):
    """Read all CSVs of one incremental folder and cast order_date / updated_at."""
    incremental_df = (
        spark.read
        .option("header", "true")
        .option("inferSchema", "true")
        .csv(sales_orders_path + "/" + incremental_data_folder + "/" + "*.csv")
    )
    return (
        incremental_df
        .withColumn("order_date", to_date(col("order_date"), 'MM/dd/yyyy'))
        .withColumn("updated_at", to_timestamp(col("updated_at"), 'yyyy-MM-dd HH:mm:ss'))
    )


def _create_store_orders_table(sales_orders_path, table_path):
    """Create the store_orders Delta table from the LOAD00000001.csv full-load file."""
    # `schema` is the module-level StructType built in the explicit-schema cell.
    # Because a schema is supplied, no inferSchema option is needed.
    full_load_df = (
        spark.read
        .option("header", "true")
        .csv(sales_orders_path + "/" + "LOAD00000001.csv", schema=schema)
    )
    full_load_df = (
        full_load_df
        .withColumn("order_date", to_date(col("order_date"), 'MM/dd/yyyy'))
        # The full-load frame gets its updated_at stamped at load time.
        .withColumn("updated_at", current_timestamp())
    )
    (
        full_load_df.write
        .format("delta")
        .option("path", table_path)
        .partitionBy("currency")
        .saveAsTable("store_orders")
    )
    display(full_load_df)


def merge_to_delta(SALES_ORDERS_PATH, DELTA_TABLE_PATH, INCREMENTAL_DATA_FOLDER):
    """Create the store_orders Delta table, or merge one incremental folder into it.

    If no Delta table exists at ``DELTA_TABLE_PATH + "/store_orders"``, the
    full-load file ``LOAD00000001.csv`` is written as a new Delta table
    partitioned by ``currency`` and registered as ``store_orders``. In that
    case ``INCREMENTAL_DATA_FOLDER`` is not read; call the function again to
    apply it.

    If the Delta table already exists, every CSV under
    ``SALES_ORDERS_PATH/INCREMENTAL_DATA_FOLDER`` is upserted on
    ``order_number``: matching rows are overwritten, new rows are inserted.

    Args:
        SALES_ORDERS_PATH: Prefix holding the full-load file and the
            incremental folders.
        DELTA_TABLE_PATH: Root location under which the Delta table lives.
        INCREMENTAL_DATA_FOLDER: Folder (relative to SALES_ORDERS_PATH) with
            the incremental CSV files.

    Raises:
        Any Spark/Delta error raised while reading or merging. Failures are no
        longer reported as "Delta table does not exist".
    """
    table_path = DELTA_TABLE_PATH + "/" + "store_orders"

    # Check for the table explicitly instead of wrapping the whole merge in a
    # bare except, which used to turn any failure into a full reload attempt.
    if not DeltaTable.isDeltaTable(spark, table_path):
        print("Delta table does not exist")
        _create_store_orders_table(SALES_ORDERS_PATH, table_path)
        return

    print("Delta table exists")
    incremental_df = _read_incremental_orders(SALES_ORDERS_PATH, INCREMENTAL_DATA_FOLDER)
    display(incremental_df)

    # The same source -> target mapping is used for both updates and inserts.
    column_map = {
        name: "store_orders_incremental." + name for name in _STORE_ORDERS_COLUMNS
    }

    # NOTE(review): the Op column is copied as data only; this merge never
    # deletes rows from the target.
    (
        DeltaTable.forPath(spark, table_path).alias("store_orders")
        .merge(
            incremental_df.alias("store_orders_incremental"),
            "store_orders.order_number = store_orders_incremental.order_number",
        )
        .whenMatchedUpdate(set=column_map)
        .whenNotMatchedInsert(values=column_map)
        .execute()
    )

In [0]:
# Select the 2023/01/12/18 folder and run the loader.
INCREMENTAL_DATA_FOLDER = "2023/01/12/18"
merge_to_delta(
    SALES_ORDERS_PATH=SALES_ORDERS_PATH,
    DELTA_TABLE_PATH=DELTA_TABLE_PATH,
    INCREMENTAL_DATA_FOLDER=INCREMENTAL_DATA_FOLDER,
)

In [0]:
# List the files stored under the store_orders table location.
display(dbutils.fs.ls("/".join([DELTA_TABLE_PATH, "store_orders"])))

In [0]:
# Load the table straight from its path and print the first five rows.
spark.read.load("/".join([DELTA_TABLE_PATH, "store_orders"]), format="delta").show(5)

In [0]:
%sql
-- Show every row of store_orders.
SELECT *
FROM store_orders;

In [0]:
%sql
-- Show the columns of store_orders.
DESCRIBE store_orders;

In [0]:
%sql
-- Show the version history recorded for store_orders.
DESCRIBE HISTORY store_orders;

In [0]:
%sql
-- Look at order 5 before it is modified.
SELECT *
FROM store_orders
WHERE order_number = 5;

In [0]:
%sql
-- Change the sale price of order 5.
UPDATE store_orders
SET sale_price = 90.50
WHERE order_number = 5;

In [0]:
%sql
-- Look at order 5 again after the update.
SELECT *
FROM store_orders
WHERE order_number = 5;


In [0]:
%sql
-- Show the version history again, now that an UPDATE has been run.
DESCRIBE HISTORY store_orders;


In [0]:
%sql
-- Time travel: read order 5 as it was in table version 0.
SELECT *
FROM store_orders VERSION AS OF 0
WHERE order_number = 5;

In [0]:
%sql
-- Delete order 5, then query for it.
DELETE FROM store_orders
WHERE order_number = 5;

SELECT *
FROM store_orders
WHERE order_number = 5;


In [0]:
%sql
-- Show the version history again, now that a DELETE has been run.
DESCRIBE HISTORY store_orders;

In [0]:
%sql
-- Restore the table to version 1, then query order 5.
RESTORE TABLE store_orders TO VERSION AS OF 1;

SELECT *
FROM store_orders
WHERE order_number = 5;

In [0]:
%sql
-- Show the version history again, now that a RESTORE has been run.
DESCRIBE HISTORY store_orders;


In [0]:
%sql
-- Row count before the next merge.
SELECT count(*)
FROM store_orders;

In [0]:
# Run the loader for the 2023/01/12/18 folder again.
INCREMENTAL_DATA_FOLDER = "2023/01/12/18"
merge_to_delta(
    SALES_ORDERS_PATH=SALES_ORDERS_PATH,
    DELTA_TABLE_PATH=DELTA_TABLE_PATH,
    INCREMENTAL_DATA_FOLDER=INCREMENTAL_DATA_FOLDER,
)


In [0]:
%sql
-- Row count after the merge, followed by a handful of specific orders.
SELECT count(*)
FROM store_orders;

SELECT *
FROM store_orders
WHERE order_number IN (500, 1254, 1501, 2234, 2345);


In [0]:
%sql
-- Show the version history after the merge call above.
DESCRIBE HISTORY store_orders;


In [0]:
# Move on to the 2023/01/12/19 folder and merge it.
INCREMENTAL_DATA_FOLDER = "2023/01/12/19"
merge_to_delta(
    SALES_ORDERS_PATH=SALES_ORDERS_PATH,
    DELTA_TABLE_PATH=DELTA_TABLE_PATH,
    INCREMENTAL_DATA_FOLDER=INCREMENTAL_DATA_FOLDER,
)

In [0]:
%sql
-- Re-check the same orders after merging the 2023/01/12/19 folder.
SELECT *
FROM store_orders
WHERE order_number IN (500, 1254, 1501, 2234, 2345);

In [0]:
%sql
-- Show the version history after merging the 2023/01/12/19 folder.
DESCRIBE HISTORY store_orders;

In [0]:
%sql
-- Version, operation and isolation level of every entry in the table history.
SELECT
  version,
  operation,
  isolationLevel
FROM (DESCRIBE HISTORY store_orders);

In [0]:
%sql
-- Set the table property delta.isolationLevel to Serializable.
ALTER TABLE store_orders SET TBLPROPERTIES ('delta.isolationLevel' = 'Serializable')

In [0]:
%sql
-- Change the sale price of order 500 after the isolation-level change.
UPDATE store_orders
SET sale_price = 100.00
WHERE order_number = 500;

In [0]:
%sql
-- Version, operation and isolation level again, including the latest UPDATE.
SELECT
  version,
  operation,
  isolationLevel
FROM (DESCRIBE HISTORY store_orders);


In [0]:
FILE_PATH_WITH_NEW_SCHEMA = "s3://aws-analytics-course/temp/schema_change.csv"

# Read the schema-change file with inferred types, then parse order_date and
# cast sale_price to float.
# NOTE: unlike the incremental loads, updated_at is not passed through
# to_timestamp here.
df_read_data_schema_change = spark.read.csv(
    FILE_PATH_WITH_NEW_SCHEMA,
    header="true",
    inferSchema="true",
)
df_read_data_schema_change = (
    df_read_data_schema_change
    .withColumn("order_date", to_date(col("order_date"), 'MM/dd/yyyy'))
    .withColumn("sale_price", col("sale_price").cast(FloatType()))
)
display(df_read_data_schema_change)
df_read_data_schema_change.printSchema()

In [0]:
# Get a DeltaTable handle for the store_orders location.
deltaTable = DeltaTable.forPath(spark, "/".join([DELTA_TABLE_PATH, "store_orders"]))

In [0]:
# Append without mergeSchema. If the frame's schema does not match the table,
# Delta rejects the write; the error is caught and printed so the rest of the
# notebook can still run.
try:
    df_read_data_schema_change.write.format("delta").mode("append").save(DELTA_TABLE_PATH + "/" + "store_orders")
except AnalysisException as schema_error:
    print("Append without mergeSchema was rejected:")
    print(schema_error)

In [0]:
# Append again with mergeSchema enabled.
df_read_data_schema_change.write.save(
    "/".join([DELTA_TABLE_PATH, "store_orders"]),
    format="delta",
    mode="append",
    mergeSchema="true",
)


In [0]:
%sql
-- Show every row of store_orders after the schema-change append.
SELECT *
FROM store_orders;