In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date, to_timestamp, hour, minute, year, month, day
from pyspark.sql.types import StringType, DoubleType, IntegerType, DateType, TimestampType

In [None]:
# Build (or reuse) the Spark session for this notebook, running on a local master.
spark = (
    SparkSession.builder
    .master("local[*]")
    # NOTE(review): executor instance count is presumably ignored with a
    # local master -- confirm whether this setting is still needed.
    .config("spark.executor.instances", "5")
    # The date/time strings parsed later in this notebook are handled under
    # the LEGACY time parser policy.
    .config("spark.sql.legacy.timeParserPolicy", "LEGACY")
    .getOrCreate()
)

In [None]:
# Read the raw supermarket sales CSV, taking column names from its header row.
df = spark.read.csv("spark-warehouse/landing/supermarket_sales.csv", header=True)

In [None]:
# Inspect the column names exactly as they were read from the CSV header.
df.columns

In [None]:
# Print the schema of the freshly loaded DataFrame (the reader above was given
# no explicit schema or inferSchema option).
df.printSchema()

In [None]:
# Preview a single row of the raw data.
df.show(1)

In [None]:
# Mapping of original CSV column name -> (new snake_case column name, target Spark type).
# Consumed by the rename/cast loop that follows in this notebook.
column_rename_and_cast = {
    'Invoice ID': ('invoice_id', StringType()),
    'Branch': ('branch', StringType()),
    'City': ('city', StringType()),
    'Customer type': ('customer_type', StringType()),
    'Gender': ('gender', StringType()),
    'Product line': ('product_line', StringType()),
    'Unit price': ('unit_price', DoubleType()),
    'Quantity': ('quantity', IntegerType()),
    'Tax 5%': ('tax_5_percent', DoubleType()),
    'Total': ('total', DoubleType()),
    'Date': ('date', StringType()), # Casting straight to a date can cause problems; kept as a string and parsed later with helper functions
    'Time': ('time', StringType()), # Casting straight to a timestamp can cause problems; kept as a string and parsed later with helper functions
    'Payment': ('payment', StringType()),
    'cogs': ('cogs', DoubleType()),
    'gross margin percentage': ('gross_margin_percentage', DoubleType()),
    'gross income': ('gross_income', DoubleType()),
    'Rating': ('rating', DoubleType())
}

In [None]:
# Walk the rename/cast mapping: give each column its new name, then cast the
# renamed column to its target type. `df` is rebound on every iteration.
for source_name, target_spec in column_rename_and_cast.items():
    target_name, target_type = target_spec
    df = (
        df.withColumnRenamed(source_name, target_name)
        .withColumn(target_name, col(target_name).cast(target_type))
    )

In [None]:
# Print the schema again to check the renamed columns and their cast types.
df.printSchema()

In [None]:
# Print the execution plan Spark built for the DataFrame after the rename/cast loop.
df.explain()

In [None]:
# Preview the renamed/cast data (show() with no argument uses its default row limit).
df.show()

In [None]:
# Parse the raw strings once as reusable column expressions, then derive the
# calendar and clock parts from them. The source string columns are dropped
# afterwards, leaving only the derived integer parts.
parsed_date = to_date(col("date"), 'MM/dd/yyyy')
parsed_time = to_timestamp(col("time"), 'HH:mm')

df = (
    df.withColumn("year", year(parsed_date))
    .withColumn("month", month(parsed_date))
    .withColumn("day", day(parsed_date))
    .withColumn("hour", hour(parsed_time))
    .withColumn("minute", minute(parsed_time))
    .drop("date", "time")
)

In [None]:
# Print the schema to check the derived year/month/day/hour/minute columns and
# that the original date/time columns are gone.
df.printSchema()

In [None]:
# Preview the final data before it is written out.
df.show()

In [None]:
# Persist the result as the parquet table `supermarket_sales_bronze`,
# overwriting any previous contents and storing the files under
# bronze/supermarket_sales.
# NOTE(review): partitioning all the way down to the minute may produce a very
# large number of tiny partitions -- confirm this layout is intended.
(
    df.write
    .format("parquet")
    .mode("overwrite")
    .partitionBy("year", "month", "day", "hour", "minute")
    .option("path", "bronze/supermarket_sales")
    .saveAsTable("supermarket_sales_bronze")
)