In [14]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date, to_timestamp, year, month, day, hour, minute
from pyspark.sql.types import StringType, DoubleType, IntegerType
from pyspark.sql.functions import col, monotonically_increasing_id

In [15]:
spark = SparkSession.builder.master("local[*]") \
    .config("spark.executor.instances", "5") \
    .config("spark.sql.legacy.timeParserPolicy", "LEGACY").getOrCreate()

In [16]:
df_bronze = spark.read.parquet("spark-warehouse/bronze/supermarket_sales")


In [17]:
df_bronze.show(5)

+-----------+------+---------+-------------+------+-------------------+----------+--------+-------------+-------+-----------+-----+-----------------------+------------+------+----+-----+---+----+------+
| invoice_id|branch|     city|customer_type|gender|       product_line|unit_price|quantity|tax_5_percent|  total|    payment| cogs|gross_margin_percentage|gross_income|rating|year|month|day|hour|minute|
+-----------+------+---------+-------------+------+-------------------+----------+--------+-------------+-------+-----------+-----+-----------------------+------------+------+----+-----+---+----+------+
|308-81-0538|     A|   Yangon|       Normal|Female|Fashion accessories|     73.05|       4|        14.61| 306.81|Credit card|292.2|            4.761904762|       14.61|   4.9|2019|    2| 25|  17|    16|
|834-83-1826|     B| Mandalay|       Member|Female| Home and lifestyle|     82.04|       5|        20.51| 430.71|Credit card|410.2|            4.761904762|       20.51|   7.6|2019|    2| 2

In [None]:
df_bronze.printSchema()
df_bronze.show(5)

In [18]:
dim_customer = df_bronze.select("customer_type", "gender").distinct()
dim_customer = dim_customer.withColumn("customer_id", monotonically_increasing_id())

In [19]:
dim_product = df_bronze.select("product_line").distinct()
dim_product = dim_product.withColumn("product_id", monotonically_increasing_id())

In [20]:
fact_sales = df_bronze.join(dim_customer, on=["customer_type", "gender"], how="inner") \
                      .join(dim_product, on="product_line", how="inner") \
                      .select(
                          col("invoice_id"),
                          col("branch"),
                          col("unit_price"),
                          col("quantity"),
                          col("tax_5_percent"),
                          col("total"),
                          col("payment"),
                          col("cogs"),
                          col("gross_margin_percentage"),
                          col("gross_income"),
                          col("rating"),
                          col("year"),
                          col("month"),
                          col("day"),
                          col("hour"),
                          col("minute"),
                          col("customer_id"),
                          col("product_id")
                      )

In [21]:
dim_customer.show()
dim_product.show()
fact_sales.show()

+-------------+------+-----------+
|customer_type|gender|customer_id|
+-------------+------+-----------+
|       Normal|  Male|          0|
|       Member|  Male|          1|
|       Normal|Female|          2|
|       Member|Female|          3|
+-------------+------+-----------+

+--------------------+----------+
|        product_line|product_id|
+--------------------+----------+
|  Home and lifestyle|         0|
| Fashion accessories|         1|
|   Health and beauty|         2|
|Electronic access...|         3|
|   Sports and travel|         4|
|  Food and beverages|         5|
+--------------------+----------+

+-----------+------+----------+--------+-------------+--------+-----------+------+-----------------------+------------+------+----+-----+---+----+------+-----------+----------+
| invoice_id|branch|unit_price|quantity|tax_5_percent|   total|    payment|  cogs|gross_margin_percentage|gross_income|rating|year|month|day|hour|minute|customer_id|product_id|
+-----------+------+----

In [22]:
dim_customer.write.parquet("spark-warehouse/dim_customer")
dim_product.write.parquet("spark-warehouse/dim_product")
fact_sales.write.parquet("spark-warehouse/fact_sales")