In [0]:
# Source location: Silver-layer Delta data that feeds every Gold table below.
silver_path = "/mnt/Prajwal/medlian_arch/Silver"

# Storage locations of the Gold-layer Delta tables (customer, product, dates
# and fact_sales). Each value must match the LOCATION clause of the
# corresponding CREATE TABLE cell further down in this notebook.
gold_customer_path = "/mnt/Prajwal/medlian_arch/Gold/customer"
gold_product_path = "/mnt/Prajwal/medlian_arch/Gold/product"
# Dim_Date is created at .../Gold/date (singular). The previous value,
# ".../Gold/dates", pointed at a directory that no table in this notebook uses.
gold_dates_path = "/mnt/Prajwal/medlian_arch/Gold/date"
gold_fact_sales_path = "/mnt/Prajwal/medlian_arch/Gold/fact_sales"

In [0]:
%sql
-- Reset the Gold table definitions so the CREATE TABLE cells below start clean.
-- IF EXISTS keeps this cell from failing on a first run (or after a partial
-- run) when some of the tables have not been created yet.
-- NOTE(review): the tables are created with an explicit LOCATION, so dropping
-- them is expected to remove only the metastore entry and leave the Delta
-- files in place -- confirm this is the intended reset behaviour.

drop table if exists Dim_Customer;

drop table if exists Dim_Date;

drop table if exists Dim_Product;

drop table if exists Fact_Sales;

In [0]:
# Load the Silver-layer Delta data from silver_path and preview it.
df = spark.read.load(silver_path, format="delta")
display(df)

In [0]:
%sql
-- Customer dimension: Delta table stored under the Gold customer folder.
CREATE TABLE IF NOT EXISTS Dim_Customer (
  CustomerID    INT,
  CustomerName  STRING,
  CustomerEmail STRING
)
USING DELTA
LOCATION '/mnt/Prajwal/medlian_arch/Gold/customer'


In [0]:
%sql
-- Product dimension: Delta table stored under the Gold product folder.
CREATE TABLE IF NOT EXISTS Dim_Product (
  Product STRING
)
USING DELTA
LOCATION '/mnt/Prajwal/medlian_arch/Gold/product'

In [0]:
%sql
-- Date dimension: Delta table holding one calendar date per row together
-- with its year / month / day parts and a display-formatted string.
CREATE TABLE IF NOT EXISTS Dim_Date (
  Date          DATE,
  Year          INT,
  Month         INT,
  Day           INT,
  FormattedDate STRING
)
USING DELTA
LOCATION '/mnt/Prajwal/medlian_arch/Gold/date'

In [0]:
%sql
-- Sales fact: Delta table stored under the Gold fact_sales folder.
-- NOTE(review): CustomerID is STRING here while Dim_Customer declares it INT;
-- confirm which type the Silver data actually carries.
CREATE TABLE IF NOT EXISTS Fact_Sales (
  OrderID       STRING,
  CustomerID    STRING,
  CustomerName  STRING,
  CustomerEmail STRING,
  Product       STRING,
  Quantity      INT,
  Price         DOUBLE,
  Tax           DOUBLE,
  TotalAmount   DOUBLE,
  OrderDate     DATE,
  InsertedAt    TIMESTAMP
)
USING DELTA
LOCATION '/mnt/Prajwal/medlian_arch/Gold/fact_sales'

In [0]:
# (Re)load the Silver Delta data that all Gold staging DataFrames are built
# from, then show its schema and contents.
df = spark.read.load(path=silver_path, format="delta")
df.printSchema()
df.display()

In [0]:
# Customer staging: one row per distinct
# (CustomerID, CustomerName, CustomerEmail) combination found in Silver,
# exposed to SQL cells as the temp view stg_dim_customer.
customer = df.select(["CustomerID", "CustomerName", "CustomerEmail"]).dropDuplicates()
customer.createOrReplaceTempView(name="stg_dim_customer")
customer.show()

In [0]:
# Product staging: one row per distinct Product value found in Silver.
product_df = df.select("Product").distinct()
# createGlobalTempView raises if the view already exists, so this cell failed
# whenever it was re-run within the same Spark application. The OrReplace
# variant is idempotent and keeps the same view name and global scope.
# Being a global temp view, SQL must reference it as global_temp.stgg_dim_product.
product_df.createOrReplaceGlobalTempView("stgg_dim_product")
product_df.show()

In [0]:
#create gold date table
from pyspark.sql.functions import *
from pyspark.sql.types import *



# Date staging: one row per distinct OrderDate, with its year / month /
# day-of-month parts and a "dd-MMM-yy" display string.
gold_dates_df = (
    df.select(
        col("OrderDate").alias("Date"),
        year(col("OrderDate")).alias("Year"),
        month(col("OrderDate")).alias("Month"),
        dayofmonth(col("OrderDate")).alias("Day"),
        date_format(col("OrderDate"), "dd-MMM-yy").alias("FormattedDate"),
    )
    .dropDuplicates()
)

gold_dates_df.show()

In [0]:
# Expose the date staging DataFrame to SQL cells as the temp view stg_dim_dates.
gold_dates_df.createOrReplaceTempView(name="stg_dim_dates")

In [0]:
%sql
-- Preview the staged date-dimension rows.
SELECT *
FROM stg_dim_dates;

In [0]:
# Sanity check: print the Python type of each staging object, in the order
# dates, customer, product.
for staging_df in (gold_dates_df, customer, product_df):
    print(type(staging_df))

In [0]:
# Creating gold fact table (sales transaction)
from pyspark.sql.functions import col

gold_fact_table = (
    df.join(customer.alias('cust'), "CustomerID").join(product_df.alias('prod'), "Product")
      .join(gold_dates_df.alias('date'), col("OrderDate") == col("date.Date"))
      .select(
          col("OrderID"),
          col("cust.CustomerID"),
          col("cust.CustomerName"),
          col("cust.CustomerEmail"),
          col("prod.Product").alias('ProductName'),
          col("Quantity").cast('int'),
          col("Price").cast('double'),
          col("Tax").cast('double'),
          (col("Quantity") * col("Price")).alias('TotalAmount'),
          col("date.FormattedDate").alias('OrderDate'),
          col("InsertedAt")
      )
)

display(gold_fact_table)

In [0]:
gold_fact_table = gold_fact_table.withColumn("OrderDate", when(to_date(gold_fact_table.OrderDate, "dd-MMM-yy").isNull(), lit(None)).otherwise(to_date(gold_fact_table.OrderDate, "dd-MMM-yy")))

In [0]:
# Expose the fact staging DataFrame to SQL cells as the temp view
# stg_fact_sales, then preview it.
gold_fact_table.createOrReplaceTempView(name="stg_fact_sales")
gold_fact_table.display()

In [0]:
%sql
-- Upsert the staged customers into Dim_Customer, keyed on CustomerEmail.
--
-- stg_dim_customer is distinct on (CustomerID, CustomerName, CustomerEmail),
-- not on CustomerEmail alone, so one email can arrive with several ID/name
-- variants. Delta MERGE fails when more than one source row matches the same
-- target row, so the source is reduced to one row per email first.
-- Rows with a NULL email are passed through untouched: NULL never satisfies
-- the ON condition, so they take the insert branch exactly as before.
-- NOTE(review): the staging view has no recency column, so the surviving
-- variant is picked by a deterministic but arbitrary ordering -- confirm
-- which variant should win.

merge into dim_customer as target
using (
  select CustomerID, CustomerName, CustomerEmail
  from (
    select
      *,
      row_number() over (
        partition by CustomerEmail
        order by CustomerID desc, CustomerName desc
      ) as row_num
    from stg_dim_customer
  ) as ranked
  where row_num = 1 or CustomerEmail is null
) as source
on target.CustomerEmail = source.CustomerEmail
when matched then 
  update set 
    target.CustomerName = source.CustomerName,
    target.CustomerID = source.CustomerID
when not matched then 
  insert (CustomerID, CustomerName, CustomerEmail)
  values (source.CustomerID, source.CustomerName, source.CustomerEmail)

In [0]:
%sql
-- Load the Product dimension, then preview it.
-- Dim_Product is created and its staging view is registered earlier in the
-- notebook, but no cell ever wrote to the table, so it was never populated
-- by this pipeline.
-- The staging view is a global temp view, hence the global_temp qualifier.
-- <=> is null-safe equality, so a NULL Product is inserted once rather than
-- again on every run.
merge into Dim_Product as target
using global_temp.stgg_dim_product as source
on target.Product <=> source.Product
when not matched then
  insert (Product)
  values (source.Product);

select * from Dim_Product limit 5

In [0]:
%sql
-- Upsert the staged dates into Dim_Date, keyed on Date:
-- existing dates get their derived attributes refreshed, new dates are added.
merge into Dim_Date as tgt
using stg_dim_dates as src
  on tgt.Date = src.Date
when matched then update set
  tgt.Year          = src.Year,
  tgt.Month         = src.Month,
  tgt.Day           = src.Day,
  tgt.FormattedDate = src.FormattedDate
when not matched then insert
  (Date, Year, Month, Day, FormattedDate)
values
  (src.Date, src.Year, src.Month, src.Day, src.FormattedDate)

In [0]:
%sql
-- Upsert the staged sales rows into Fact_Sales, keyed on OrderID.
-- The staging view may hold several rows per OrderID; only the row with the
-- latest InsertedAt per OrderID is fed to the merge.
merge into Fact_Sales as tgt
using (
    select *
    from (
        select
            *,
            row_number() over (partition by OrderID order by InsertedAt desc) as row_num
        from stg_fact_sales
    ) as ranked
    where row_num = 1
) as src
  on tgt.OrderID = src.OrderID
when matched then update set
    tgt.CustomerID    = src.CustomerID,
    tgt.CustomerName  = src.CustomerName,
    tgt.CustomerEmail = src.CustomerEmail,
    tgt.Product       = src.ProductName,
    tgt.Quantity      = src.Quantity,
    tgt.Price         = src.Price,
    tgt.Tax           = src.Tax,
    tgt.TotalAmount   = src.TotalAmount,
    tgt.OrderDate     = src.OrderDate,
    tgt.InsertedAt    = src.InsertedAt
when not matched then insert
    (OrderID, CustomerID, CustomerName, CustomerEmail, Product,
     Quantity, Price, Tax, TotalAmount, OrderDate, InsertedAt)
values
    (src.OrderID, src.CustomerID, src.CustomerName, src.CustomerEmail, src.ProductName,
     src.Quantity, src.Price, src.Tax, src.TotalAmount, src.OrderDate, src.InsertedAt)