In [0]:
%run ../01-Config/02-config

In [0]:
# Make sure the silver-layer customer dimension table exists in Snowflake.
# The DDL text is assembled from an ordered list of "<column> <type>" entries;
# the list order is the column order of the created table.
silver_customer_columns = [
    "customer_id STRING",
    "name STRING",
    "email STRING",
    "country STRING",
    "customer_type STRING",
    "registration_date DATE",
    "age INT",
    "gender STRING",
    "total_purchases INT",
    "customer_segment STRING",
    "days_since_registration INT",
    "last_updated TIMESTAMP",
]
query = (
    "CREATE TABLE IF NOT EXISTS ECOMMERCE_DB.SILVER_DB.customers (\n    "
    + ",\n    ".join(silver_customer_columns)
    + ")"
)

# `spark` and `snowflake_config` are not defined in this notebook; presumably
# they come from the %run config notebook / the runtime -- verify there.
snowflake_utils = spark._jvm.net.snowflake.spark.snowflake.Utils
snowflake_utils.runQuery(snowflake_config, query)

In [0]:
# Determine the watermark for the incremental load: the most recent
# `last_updated` value already present in the silver customers table.
# When the table is empty (MAX returns NULL) fall back to a date far in the
# past so that every bronze row is picked up.
DEFAULT_LAST_PROCESSED = "1900-01-01T00:00:00.000+00:00"

customer_query = "SELECT MAX(last_updated) as last_processed FROM ECOMMERCE_DB.SILVER_DB.CUSTOMERS"
last_processed_df = spark.read.format("snowflake") \
    .options(**snowflake_config) \
    .option("query", customer_query) \
    .load()
# Display the schema to check the column names
last_processed_df.printSchema()

# An aggregate without GROUP BY always yields exactly one row, so a single
# first() is enough; calling count() and then collect() would run the
# Snowflake query twice. first() returns None for an empty result, which is
# still handled below.
last_processed_row = last_processed_df.first()

last_processed_timestamp = None
if last_processed_row is not None and 'LAST_PROCESSED' in last_processed_row:
    last_processed_timestamp = last_processed_row['LAST_PROCESSED']
if last_processed_timestamp is None:
    last_processed_timestamp = DEFAULT_LAST_PROCESSED

# NOTE(review): this watermark is the silver processing time (last_updated)
# and is later compared against the bronze ingestion_timestamp; rows ingested
# into bronze between the bronze read and the silver write of a run could be
# skipped by the next run -- confirm this is acceptable.

In [0]:
# Pull only the bronze customer rows ingested after the silver watermark.
# The statement is written as explicit pieces so the whitespace that is part
# of the SQL text stays visible.
query = (
    "\n"
    "SELECT *\n"
    "FROM ECOMMERCE_DB.BRONZE.customer c \n"
    f"WHERE c.ingestion_timestamp > '{last_processed_timestamp}'\n"
)

# Lazy Snowflake read; nothing is fetched until an action runs on the frame.
customer_view = (
    spark.read.format("snowflake")
    .options(**snowflake_config)
    .option("query", query)
    .load()
)


In [0]:
# Expose the incremental bronze rows to Spark SQL under the name
# `bronze_incremental`, then print a preview of the registered view.
customer_view.createOrReplaceTempView("bronze_incremental")
spark.table("bronze_incremental").show()

In [0]:
# Silver-layer rules applied to the bronze increment in the next cell:
#   - Keep only rows whose email is not null.
#   - Keep only rows whose age is between 18 and 100 (inclusive).
#   - Derive customer_segment: total_purchases > 10000 -> 'High Value',
#     total_purchases > 5000 -> 'Medium Value', otherwise 'Low Value'.
#   - Derive days_since_registration: days elapsed since registration_date.
#   - Drop junk records where total_purchases is negative.


In [0]:
# Build the `silver_incremental` temp view from `bronze_incremental`:
#   * rows are kept only when age is within 18..100, email is present and
#     total_purchases is not negative;
#   * customer_segment is bucketed from total_purchases;
#   * days_since_registration is counted from registration_date to today;
#   * last_updated is stamped with the current timestamp.
silver_incremental_sql = """
CREATE OR REPLACE TEMPORARY VIEW silver_incremental AS
SELECT
    customer_id,
    name,
    email,
    country,
    customer_type,
    registration_date,
    age,
    gender,
    total_purchases,
    CASE
        WHEN total_purchases > 10000 THEN 'High Value'
        WHEN total_purchases > 5000 THEN 'Medium Value'
        ELSE 'Low Value'
    END AS customer_segment,
    DATEDIFF(CURRENT_DATE(), registration_date) AS days_since_registration,
    CURRENT_TIMESTAMP() AS last_updated
FROM bronze_incremental
WHERE 
    age BETWEEN 18 AND 100
    AND email IS NOT NULL
    AND total_purchases >= 0
"""
spark.sql(silver_incremental_sql)

In [0]:
# Read the current silver customers table and split the validated increment
# into customers that already exist in silver (updates) and customers that
# do not (new rows).
customer_query = "SELECT * FROM ECOMMERCE_DB.SILVER_DB.CUSTOMERS"
target_df = spark.read.format("snowflake") \
    .options(**snowflake_config) \
    .option("query", customer_query) \
    .load()

display(target_df)

source_cust = spark.sql("select * from silver_incremental")

# left_semi: source rows whose customer_id is already in the target.
# Only source columns are returned, and a source row is emitted once even if
# the target holds several rows for that customer_id.
upd_cust = source_cust.join(target_df, "customer_id", "left_semi")

# left_anti: source rows whose customer_id is NOT in the target.
# A "right" join here would return every source row (matched or not), so
# already-existing customers would be treated as new as well.
new_cust = source_cust.join(target_df, "customer_id", "left_anti")

display(upd_cust)
display(new_cust)








In [0]:
# Upsert the validated increment into the silver customers table in Snowflake.
#
# Writing the "updated" rows with mode("overwrite") would replace the whole
# silver table with just those rows and lose every other customer, so the
# increment is loaded into a staging table and merged instead:
#   1. overwrite the staging table with the increment (`source_cust`);
#   2. MERGE staging into silver on customer_id (update matches, insert rest);
#   3. drop the staging table.
stage_table = "customers_stage"

source_cust.write \
    .format("snowflake") \
    .options(**snowflake_config) \
    .option("dbtable", stage_table) \
    .option("sfDatabase", 'ecommerce_db') \
    .option("sfSchema", 'silver_db') \
    .mode("overwrite") \
    .save()

# Column list of the silver table, in the order used by its CREATE TABLE.
silver_columns = [
    "customer_id",
    "name",
    "email",
    "country",
    "customer_type",
    "registration_date",
    "age",
    "gender",
    "total_purchases",
    "customer_segment",
    "days_since_registration",
    "last_updated",
]
update_set = ", ".join(
    f"tgt.{col} = src.{col}" for col in silver_columns if col != "customer_id"
)
insert_columns = ", ".join(silver_columns)
insert_values = ", ".join(f"src.{col}" for col in silver_columns)

# NOTE(review): assumes the increment holds at most one row per customer_id
# and that the connector stores staging column names case-insensitively
# (its default) -- TODO confirm both.
merge_query = f"""
MERGE INTO ECOMMERCE_DB.SILVER_DB.CUSTOMERS tgt
USING ECOMMERCE_DB.SILVER_DB.CUSTOMERS_STAGE src
    ON tgt.customer_id = src.customer_id
WHEN MATCHED THEN UPDATE SET {update_set}
WHEN NOT MATCHED THEN INSERT ({insert_columns}) VALUES ({insert_values})
"""

sf_utils = spark._jvm.net.snowflake.spark.snowflake.Utils
sf_utils.runQuery(snowflake_config, merge_query)
sf_utils.runQuery(
    snowflake_config,
    "DROP TABLE IF EXISTS ECOMMERCE_DB.SILVER_DB.CUSTOMERS_STAGE",
)


