In [0]:
from pyspark.sql.functions import col, count, when

# ---------------------------------------------------------------------------
# Customers: bronze parquet -> silver Delta table
# ---------------------------------------------------------------------------

# Read the raw customer records from the bronze volume.
customer_df = spark.read.format("parquet").load(
    "/Volumes/real_time_projects/ecommerce_historical/lakehouse_vol/bronze/customers/"
)

# Type normalisation: the zip-code prefix is converted to an integer.
# NOTE(review): if the source column is a string with leading zeros, the int
# cast will drop them — confirm this is acceptable for downstream joins.
customer_df_casted = customer_df.withColumn(
    "customer_zip_code_prefix",
    col("customer_zip_code_prefix").cast("int"),
)

# Uncomment to inspect the schema after casting:
# customer_df_casted.printSchema()

# Data profile: a single row with, for every column, how many values are NULL.
# `when` yields a non-null value only for NULL cells, and `count` counts
# non-null values, so the result is the NULL count per column.
display(
    customer_df_casted.select(
        [
            count(when(col(c).isNull(), c)).alias(c)
            for c in customer_df_casted.columns
        ]
    )
)

# Business key of a customer row; used for both the duplicate check and the cleanup.
customer_keys = ["customer_id", "customer_unique_id"]

# Number of key combinations that occur on more than one row.
duplicate_keys_count = (
    customer_df_casted.groupBy(*customer_keys)
    .count()
    .where(col("count") > 1)
    .count()
)

print(f"Duplicate key groups: {duplicate_keys_count}")

# Silver frame: rows missing any part of the business key are removed, then
# a single row is kept for each key combination.
customer_df_silver = customer_df_casted.dropna(subset=customer_keys).dropDuplicates(
    customer_keys
)

# Persist as a managed Delta table, replacing any previous contents.
(
    customer_df_silver.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("real_time_projects.ecommerce_historical.customers")
)

In [0]:
# `count` and `when` are used by the NULL profile below; import them here so the
# cell does not depend on names leaked from another cell.
from pyspark.sql.functions import col, count, when

# Read the raw order-item records from the bronze volume.
order_items_df = (
    spark.read
    .format("parquet")
    .load("/Volumes/real_time_projects/ecommerce_historical/lakehouse_vol/bronze/order_items/")
)

# Cast columns to appropriate data types
# NOTE(review): monetary columns are stored as double; confirm that floating-point
# precision is acceptable here.
order_items_df_casted = (
    order_items_df
    .withColumn("shipping_limit_date", col("shipping_limit_date").cast("timestamp"))
    .withColumn("price", col("price").cast("double"))
    .withColumn("freight_value", col("freight_value").cast("double"))
)

# order_items_df_casted.printSchema()

# Display the number of null values in each column of the DataFrame.
# `when` yields a non-null value only for NULL cells and `count` counts non-null
# values, so each output column holds the NULL count of the input column.
display(
    order_items_df_casted.select([
        count(when(col(c).isNull(), c)).alias(c)
        for c in order_items_df_casted.columns
    ])
)

# Business key of an order-item row; shared by the duplicate check and the cleanup.
order_item_keys = ["order_id", "order_item_id"]

# Check for duplicate keys on the casted frame (the same frame that is cleaned
# below), consistent with the other tables in this notebook.
duplicate_keys_count = (
    order_items_df_casted
    .groupBy(*order_item_keys)
    .count()
    .filter("count > 1")
    .count()
)

print(f"Duplicate key groups: {duplicate_keys_count}")

# Drop rows with NULL business keys, then keep a single row per key combination.
order_items_df_silver = (
    order_items_df_casted
    .dropna(subset=order_item_keys)
    .dropDuplicates(order_item_keys)
)

# Write to Delta table (replaces any previous contents).
order_items_df_silver.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("real_time_projects.ecommerce_historical.order_items")

In [0]:
# `count` and `when` are used by the NULL profile below; import them here so the
# cell does not depend on names leaked from another cell.
from pyspark.sql.functions import col, count, when

# Read the raw payment records from the bronze volume.
order_payments_df = (
    spark.read
    .format("parquet")
    .load("/Volumes/real_time_projects/ecommerce_historical/lakehouse_vol/bronze/payments/")
)

# Cast columns to appropriate data types
# NOTE(review): payment_value is stored as double; confirm that floating-point
# precision is acceptable for monetary amounts.
order_payments_df_casted = (
    order_payments_df
    .withColumn("payment_sequential", col("payment_sequential").cast("int"))
    .withColumn("payment_installments", col("payment_installments").cast("int"))
    .withColumn("payment_value", col("payment_value").cast("double"))
)

# order_payments_df_casted.printSchema()

# Display the number of null values in each column of the DataFrame.
# `when` yields a non-null value only for NULL cells and `count` counts non-null
# values, so each output column holds the NULL count of the input column.
display(
    order_payments_df_casted.select([
        count(when(col(c).isNull(), c)).alias(c)
        for c in order_payments_df_casted.columns
    ])
)

# Business key of a payment row; shared by the duplicate check and the cleanup.
payment_keys = ["order_id", "payment_sequential"]

# Check for duplicate keys: number of key combinations present on more than one row.
duplicate_keys_count = (
    order_payments_df_casted
    .groupBy(*payment_keys)
    .count()
    .filter("count > 1")
    .count()
)

print(f"Duplicate key groups: {duplicate_keys_count}")

# Drop rows with NULL business keys, then keep a single row per key combination.
order_payment_df_silver = (
    order_payments_df_casted
    .dropna(subset=payment_keys)
    .dropDuplicates(payment_keys)
)

# Write to Delta table (replaces any previous contents).
order_payment_df_silver.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("real_time_projects.ecommerce_historical.payments")

In [0]:
from pyspark.sql.functions import coalesce, col, count, when, current_timestamp

# Read the raw order records from the bronze volume.
orders_df = (
    spark.read
    .format("parquet")
    .load("/Volumes/real_time_projects/ecommerce_historical/lakehouse_vol/bronze/orders/")
)

# Cast columns to appropriate data types: every order lifecycle date becomes a timestamp.
orders_df_casted = (
    orders_df
    .withColumn("order_purchase_timestamp", col("order_purchase_timestamp").cast("timestamp"))
    .withColumn("order_approved_at", col("order_approved_at").cast("timestamp"))
    .withColumn("order_delivered_carrier_date", col("order_delivered_carrier_date").cast("timestamp"))
    .withColumn("order_delivered_customer_date", col("order_delivered_customer_date").cast("timestamp"))
    .withColumn("order_estimated_delivery_date", col("order_estimated_delivery_date").cast("timestamp"))
)

# Display the number of null values in each column of the DataFrame.
# `when` yields a non-null value only for NULL cells and `count` counts non-null
# values, so each output column holds the NULL count of the input column.
display(
    orders_df_casted.select([
        count(when(col(c).isNull(), c)).alias(c)
        for c in orders_df_casted.columns
    ])
)

# Business key of an order row; shared by the duplicate check and the cleanup.
business_keys = ["order_id", "customer_id"]

# Check for duplicate keys: number of key combinations present on more than one row.
duplicate_keys_count = (
    orders_df_casted
    .groupBy(*business_keys)
    .count()
    .filter("count > 1")
    .count()
)

print(f"Duplicate key groups: {duplicate_keys_count}")

# Drop rows with NULL business keys, then keep a single row per key combination.
orders_df_silver = (
    orders_df_casted
    .dropna(subset=business_keys)
    .dropDuplicates(business_keys)
)

# Fill NULL values for the remaining lifecycle timestamps with current_timestamp().
# coalesce(column, current_timestamp()) keeps the existing value when present and
# substitutes the current timestamp only when the column is NULL.
# NOTE(review): this replaces "not yet approved / shipped / delivered" with the
# pipeline run time, so the stored value differs between runs and cannot be told
# apart from a real event time — confirm this is the intended business rule.
orders_df_silver_cleaned = (
    orders_df_silver
    .withColumn(
        "order_approved_at",
        coalesce(col("order_approved_at"), current_timestamp())
    )
    .withColumn(
        "order_delivered_carrier_date",
        coalesce(col("order_delivered_carrier_date"), current_timestamp())
    )
    .withColumn(
        "order_delivered_customer_date",
        coalesce(col("order_delivered_customer_date"), current_timestamp())
    )
)

# Display the number of null values in each column of the cleaned DataFrame.
# The column list comes from the frame being profiled, so the check stays
# correct if the cleaning step ever adds or removes columns.
display(
    orders_df_silver_cleaned.select([
        count(when(col(c).isNull(), c)).alias(c)
        for c in orders_df_silver_cleaned.columns
    ])
)

# Write to Delta table (replaces any previous contents).
orders_df_silver_cleaned.write\
    .format("delta")\
    .mode("overwrite")\
    .saveAsTable("real_time_projects.ecommerce_historical.orders")

In [0]:
# `count` and `when` are used by the NULL profiles below; import them here so the
# cell does not depend on names leaked from another cell.
from pyspark.sql.functions import col, count, when

# Read the raw product records from the bronze volume.
products_df = (
    spark.read
    .format("parquet")
    .load("/Volumes/real_time_projects/ecommerce_historical/lakehouse_vol/bronze/products/")
)

# Cast columns to appropriate data types: all numeric product attributes become int.
# The misspelled "lenght" column names are reproduced exactly as they are referenced
# in the source data.
products_df_casted = (
    products_df
    .withColumn("product_name_lenght", col("product_name_lenght").cast("int"))
    .withColumn("product_description_lenght", col("product_description_lenght").cast("int"))
    .withColumn("product_photos_qty", col("product_photos_qty").cast("int"))
    .withColumn("product_weight_g", col("product_weight_g").cast("int"))
    .withColumn("product_length_cm", col("product_length_cm").cast("int"))
    .withColumn("product_height_cm", col("product_height_cm").cast("int"))
    .withColumn("product_width_cm", col("product_width_cm").cast("int"))
)

# products_df_casted.printSchema()

# Display the number of null values in each column of the DataFrame.
# `when` yields a non-null value only for NULL cells and `count` counts non-null
# values, so each output column holds the NULL count of the input column.
display(
    products_df_casted.select([
        count(when(col(c).isNull(), c)).alias(c)
        for c in products_df_casted.columns
    ])
)

# Check for duplicate keys: number of product_id values present on more than one row.
duplicate_keys_count = (
    products_df_casted
    .groupBy("product_id")
    .count()
    .filter("count > 1")
    .count()
)

print(f"Duplicate key groups: {duplicate_keys_count}")

# Drop Null values & Duplicates from the business keys, then default the
# remaining NULL attributes.
# NOTE(review): a default of 0 for weight and dimensions cannot be distinguished
# from a real zero measurement downstream — confirm this is the intended rule.
product_df_silver = (
    products_df_casted
    # Step 1: drop rows with a NULL business key
    .dropna(subset=["product_id"])

    # Step 2: keep a single row per product_id
    .dropDuplicates(["product_id"])

    # Step 3: fill NULLs with default values
    .fillna({
        "product_category_name": "UNKNOWN",
        "product_name_lenght": 0,
        "product_description_lenght": 0,
        "product_photos_qty": 0,
        "product_weight_g": 0,
        "product_length_cm": 0,
        "product_height_cm": 0,
        "product_width_cm": 0
    })
)

# Display the number of null values in each column of the cleaned DataFrame.
display(
    product_df_silver.select([
        count(when(col(c).isNull(), c)).alias(c)
        for c in product_df_silver.columns
    ])
)

# Write to Delta table (replaces any previous contents).
product_df_silver.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("real_time_projects.ecommerce_historical.products")