In [0]:
%sql
-- Report the catalog and schema that the session currently resolves
-- unqualified table names against.
-- NOTE(review): the saved output shows `demo`, but `use schema demo` only runs
-- in the following cell -- the output was probably captured on a re-run; confirm.
SELECT
  current_catalog(),
  current_database()

current_catalog(),current_database()
spark_catalog,demo


In [0]:
%sql
-- Rebuild the `demo` schema from scratch: CASCADE drops any tables it still
-- holds, then the empty schema is recreated and made the current one so the
-- unqualified table names used below land inside it.
DROP SCHEMA IF EXISTS demo CASCADE;
CREATE SCHEMA IF NOT EXISTS demo;
USE SCHEMA demo;

In [0]:

from pyspark.sql.functions import col

# List everything in the upload folder.
dbfsList = dbutils.fs.ls("dbfs:/FileStore/tables/")

# Turn the listing into a DataFrame with explicit column names.
listing_columns = ["path", "name", "size", "modificationTime"]
df = spark.createDataFrame(dbfsList).toDF(*listing_columns)

# Keep only the entries whose file name ends in ".parquet".
is_parquet = col("name").endswith(".parquet")
filtered_df = df.where(is_parquet)

display(filtered_df)

path,name,size,modificationTime
dbfs:/FileStore/tables/media_customer_reviews.parquet,media_customer_reviews.parquet,46004,1747220838000
dbfs:/FileStore/tables/media_gold_reviews_chunked.parquet,media_gold_reviews_chunked.parquet,23557,1747220838000
dbfs:/FileStore/tables/sales_customers.parquet,sales_customers.parquet,28493,1747220838000
dbfs:/FileStore/tables/sales_franchises.parquet,sales_franchises.parquet,5905,1747220839000
dbfs:/FileStore/tables/sales_suppliers.parquet,sales_suppliers.parquet,4591,1747220839000
dbfs:/FileStore/tables/sales_transactions.parquet,sales_transactions.parquet,86578,1747220839000


In [0]:
# Folder that holds the uploaded Parquet files.
BRONZE_SOURCE_DIR = "dbfs:/FileStore/tables"


def load_bronze_table(table_name):
    """Read one uploaded Parquet file and save it as a table of the same name.

    Args:
        table_name: Base name of the Parquet file (without the ``.parquet``
            extension). The same name is used for the target table, which is
            created in the current schema.

    Returns:
        The DataFrame read from the Parquet file.
    """
    source_df = spark.read.format("parquet").load(
        f"{BRONZE_SOURCE_DIR}/{table_name}.parquet"
    )
    # Overwrite instead of append: with append, re-running this cell without
    # first dropping the schema would duplicate every row and inflate the
    # aggregates computed later. This also matches the write mode used for the
    # silver and gold tables.
    source_df.write.mode("overwrite").saveAsTable(table_name)
    return source_df


# create sales_customers
sales_customers = load_bronze_table("sales_customers")

# create sales_franchises
sales_franchises = load_bronze_table("sales_franchises")

# create sales_suppliers
sales_suppliers = load_bronze_table("sales_suppliers")

# create sales_transactions
sales_transactions = load_bronze_table("sales_transactions")

# create media_customer_reviews
media_customer_reviews = load_bronze_table("media_customer_reviews")

# create media_gold_reviews_chunked
media_gold_reviews_chunked = load_bronze_table("media_gold_reviews_chunked")

Create Silver Tables

In [0]:
def _copy_to_silver(source_table, silver_table):
    """Read `source_table` and overwrite `silver_table` with its contents.

    Returns the DataFrame that was read, so the caller can keep a handle on it.
    """
    frame = spark.read.table(source_table)
    frame.write.saveAsTable(silver_table, mode="overwrite")
    return frame


# Each silver table is currently a straight copy of its source table;
# no cleaning or transformation is applied in this cell.
df_media_customer_reviews = _copy_to_silver(
    "media_customer_reviews", "silver_media_customer_reviews"
)
df_media_gold_reviews_chunked = _copy_to_silver(
    "media_gold_reviews_chunked", "silver_media_gold_reviews_chunked"
)
df_sales_customers = _copy_to_silver(
    "sales_customers", "silver_sales_customers"
)
df_sales_franchises = _copy_to_silver(
    "sales_franchises", "silver_sales_franchises"
)
df_sales_suppliers = _copy_to_silver(
    "sales_suppliers", "silver_sales_suppliers"
)
df_sales_transactions = _copy_to_silver(
    "sales_transactions", "silver_sales_transactions"
)

Create Gold Layer Tables

In [0]:
# Gold table: the single product that appears in the most transaction rows.
# NOTE(review): count(transactionID) counts transaction rows, not units sold;
# if "most sold" should mean total units, sum(quantity) may be intended -- confirm.
# NOTE(review): with `limit 1`, a tie for first place keeps only one of the
# tied products, and which one is not determined by the query -- confirm this is acceptable.
sql_query = """
select product, count(transactionID) 
as no_products_sold 
from silver_sales_transactions group by product order by no_products_sold desc limit 1"""

df_most_sold_products = spark.sql(sql_query)

# Show the result in the notebook, then persist it, replacing any previous copy.
display(df_most_sold_products)
df_most_sold_products.write.saveAsTable(
    "gold_most_sold_products", mode="overwrite"
)

product,no_products_sold
Golden Gate Ginger,586


In [0]:
# Gold table: for every supplier, how many franchises reference it.
# The left join keeps suppliers that have no franchise; count(f.franchiseID)
# ignores the resulting NULLs, so those suppliers get a count of 0.
# The query has no limit, so every supplier is written, not just the top one.
sql_query_most_supplied_supplierID = """select s.supplierID,s.name, count(f.franchiseID) as count_franchises
from silver_sales_suppliers s 
left join silver_sales_franchises f 
on s.supplierID = f.supplierID
group by s.supplierID,s.name
order by count_franchises desc"""

df_most_supplied_supplierID = spark.sql(sql_query_most_supplied_supplierID)

# Show the result in the notebook, then persist it, replacing any previous copy.
display(df_most_supplied_supplierID)
df_most_supplied_supplierID.write.saveAsTable(
    "gold_most_supplied_supplierID", mode="overwrite"
)

supplierID,name,count_franchises
4000009,Maple Monarch,1
4000026,Mace Meadows,1
4000004,Vanilla Valley,1
4000024,Anise Acres,1
4000023,Fennel Fields,1
4000021,Cocoa Crops,1
4000014,Molasses Mills,1
4000025,Nutmeg Nirvana,1
4000022,Poppy Peaks,1
4000001,Coconut Grove,1


In [0]:
# Gold table: total quantity and total sales amount per calendar month.
# NOTE(review): the grouping key is the month number only ("MM"), so the same
# month from different years would be summed together -- confirm the data
# covers a single year, or include the year in the key.
sql_query_total_sales = """
select DATE_FORMAT(dateTime, "MM") as Month_data, sum(quantity) as total_quantity, sum(totalPrice) as total_sales_amount
from silver_sales_transactions
group by Month_data"""

df_total_sales = spark.sql(sql_query_total_sales)

# Show the result in the notebook, then persist it, replacing any previous copy.
display(df_total_sales)
df_total_sales.write.saveAsTable("gold_total_sales", mode="overwrite")

Month_data,total_quantity,total_sales_amount
5,22157,66471
