In [0]:
#import all libraries
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import *

In [0]:
# Storage locations used by this notebook: the bronze location is the input
# that gets read, the silver location is where the cleaned output is written.
bronze_path, silver_path = (
    "/mnt/Prajwal/Retail_sales_usecase/bronzePDetails",
    "/mnt/Prajwal/Retail_sales_usecase/Silver/SilverPDetails",
)

In [0]:
%python
# Bronze -> silver transform for product details.
# Reads the raw rows stored as parquet under bronze_path, splits the
# pipe-delimited 'value' column into product fields, cleans them, and
# overwrites the Delta table at silver_path.

# Read the parquet data from the bronze path. Parquet files carry their own
# schema, so no "header" reader option is set.
df = spark.read.format("parquet").load(bronze_path)

# Split the 'value' column into separate columns.
# The pattern is a raw string so the regex for a literal pipe ('\|') is passed
# through unchanged; in a normal string literal '\|' is an invalid escape
# sequence that Python warns about.
product_fields = ["product_id", "product_name", "category", "price", "brand", "in_stock"]
value_parts = split(col("value"), r"\|")
for position, field_name in enumerate(product_fields):
    # Trim while extracting so later comparisons (header filter, de-duplication)
    # are not affected by padding around the delimiter.
    df = df.withColumn(field_name, trim(value_parts.getItem(position)))

# Drop the original 'value' column
df = df.drop("value")

# Filter out the header row (its product_id field holds the literal text
# 'product_id'). Rows whose product_id is null are removed here as well,
# because the comparison evaluates to null for them.
df = df.filter(col("product_id") != "product_id")

# Cast 'price' column to float and round to 2 decimal places
df = df.withColumn("price", round(col("price").cast("float"), 2))

# Fill null values with specified default values
df = df.fillna({
    "product_id": "unknown",
    "product_name": "unknown",
    "category": "unknown",
    "price": 0.0,
    "brand": "unknown",
    "in_stock": "false",
})

# Re-format the ingestion timestamp as 'yyyy-MM-dd HH:mm:ss' in a new
# 'ingestion_time' column and drop the source 'ingest_time' column.
# assumes the bronze data has an 'ingest_time' column -- TODO confirm
df = df.withColumn(
    "ingestion_time", date_format(col("ingest_time"), "yyyy-MM-dd HH:mm:ss")
).drop("ingest_time")

# Trim whitespace from all columns.
# NOTE(review): trim() operates on strings, so every non-string column --
# including the rounded float 'price' -- is implicitly cast to string here and
# is written to silver as a string. Kept as-is to preserve the existing silver
# schema; trim only string columns if a numeric 'price' is wanted downstream.
df = df.select([trim(col(c)).alias(c) for c in df.columns])

# Drop duplicate rows based on 'product_id'.
# Which of the duplicate rows is kept is not specified by dropDuplicates.
df = df.dropDuplicates(["product_id"])

# Write the DataFrame to the silver path, replacing any existing data
df.write.format("delta").mode("overwrite").save(silver_path)

In [0]:
# Load the Delta data stored at the silver path and render it in the notebook.
silver_df = spark.read.load(silver_path, format="delta")
display(silver_df)