In [10]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date, to_timestamp, year, month, day, hour, minute
from pyspark.sql.types import StringType, DoubleType, IntegerType

In [11]:
# Build (or reuse) the notebook's Spark session on a local[*] master with the
# legacy time-parser policy enabled.
# NOTE(review): "spark.executor.instances" is normally a cluster-manager setting;
# presumably it has no effect with a local[*] master — confirm before relying on it.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.executor.instances", "5")
    .config("spark.sql.legacy.timeParserPolicy", "LEGACY")
    .getOrCreate()
)

In [12]:
# Load the bronze-layer supermarket sales data from its Parquet location.
df_bronze = spark.read.format("parquet").load("spark-warehouse/bronze/supermarket_sales")


In [13]:
# Inspect the bronze data: column names/types first, then a five-row sample.
df_bronze.printSchema()
df_bronze.show(n=5)

root
 |-- invoice_id: string (nullable = true)
 |-- branch: string (nullable = true)
 |-- city: string (nullable = true)
 |-- customer_type: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- product_line: string (nullable = true)
 |-- unit_price: double (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- tax_5_percent: double (nullable = true)
 |-- total: double (nullable = true)
 |-- payment: string (nullable = true)
 |-- cogs: double (nullable = true)
 |-- gross_margin_percentage: double (nullable = true)
 |-- gross_income: double (nullable = true)
 |-- rating: double (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- hour: integer (nullable = true)
 |-- minute: integer (nullable = true)

+-----------+------+---------+-------------+------+-------------------+----------+--------+-------------+-------+-----------+-----+-----------------------+------------+------+----+-----+-

In [7]:
# Derive the customer table from the bronze sales data: keep the four
# customer-related columns, drop exact duplicate rows, and expose invoice_id
# under the name customer_id.
# NOTE(review): invoice_id is used as the customer key here; if every invoice is
# unique, distinct() removes nothing and a repeat customer gets one row per
# purchase — confirm this is intended.
df_customers = (
    df_bronze
    .select("invoice_id", "customer_type", "gender", "city")
    .distinct()
    .withColumnRenamed("invoice_id", "customer_id")
)

In [9]:
# Inspect the derived customer table: schema first, then a five-row sample.
df_customers.printSchema()
df_customers.show(n=5)

root
 |-- customer_id: string (nullable = true)
 |-- customer_type: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- city: string (nullable = true)

+-----------+-------------+------+---------+
|customer_id|customer_type|gender|     city|
+-----------+-------------+------+---------+
|115-38-7388|       Member|Female|Naypyitaw|
|189-08-9157|       Normal|Female|Naypyitaw|
|339-18-7061|       Member|Female|Naypyitaw|
|873-95-4984|       Member|Female| Mandalay|
|672-51-8681|       Member|Female|Naypyitaw|
+-----------+-------------+------+---------+
only showing top 5 rows

