In [0]:
%python
# Cell 1: Source (bronze) and target (silver) locations for the order details
bronze_path = '/mnt/Prajwal/Retail_sales_usecase/bronzeODetails'
silver_path = '/mnt/Prajwal/Retail_sales_usecase/Silver/silverODetails'

# Cell 2: Load the bronze order details, which are stored as Parquet
df = spark.read.parquet(bronze_path)

# Cell 3: Define UDF to parse dates and convert to timestamp
from pyspark.sql.functions import col, to_timestamp
from pyspark.sql.types import TimestampType
from pyspark.sql.functions import udf
from datetime import datetime

def parse_with_year(date_str):
    """Parse an order-date string into a ``datetime``.

    Two layouts are accepted, tried in this order:

    * ``MM/DD/YY HH:MM``   (two-digit year)
    * ``MM/DD/YYYY HH:MM`` (four-digit year)

    A two-digit year is ambiguous, so if ``strptime`` expands it to a year
    more than five years past the current year, it is moved back one century
    (e.g. ``68`` -> 2068 -> 1968). A four-digit year is explicit and is
    returned exactly as written.

    Args:
        date_str: The raw date string; surrounding whitespace is ignored.

    Returns:
        The parsed ``datetime``, or ``None`` when the input is not a string,
        is empty/blank, or matches neither layout.
    """
    # Anything that is not text (None, numbers, existing timestamps) cannot be
    # parsed; report it as missing rather than failing the whole Spark task.
    if not isinstance(date_str, str):
        return None

    # Remove leading and trailing spaces
    date_str = date_str.strip()
    if not date_str:
        return None

    try:
        date_obj = datetime.strptime(date_str, '%m/%d/%y %H:%M')
    except ValueError:
        # Not a two-digit-year value: try the four-digit layout. Its year is
        # unambiguous, so no century correction is applied.
        try:
            return datetime.strptime(date_str, '%m/%d/%Y %H:%M')
        except ValueError:
            return None

    # Century pivot for two-digit years only: a year well into the future is
    # taken to belong to the previous century.
    if date_obj.year > datetime.now().year + 5:
        try:
            date_obj = date_obj.replace(year=date_obj.year - 100)
        except ValueError:
            # Feb 29 shifted onto a non-leap year; treat as unparseable.
            return None

    return date_obj

# Wrap the parser as a Spark UDF that yields timestamp values
parse_udf = udf(parse_with_year, returnType=TimestampType())

# Overwrite order_date in place with its parsed timestamp
df = df.withColumn("order_date", parse_udf(df["order_date"]))

# Cell 4: Data cleaning and transformation
from pyspark.sql.functions import col, regexp_replace, trim, monotonically_increasing_id

# Rename columns: order_value -> order_amount and branch_code -> store_code
df = df.withColumnRenamed("order_value", "order_amount").withColumnRenamed("branch_code", "store_code")

# Replace missing (null) values of state, country, and order_channel with "Unknown"
df = df.fillna(value={"state": "Unknown", "country": "Unknown", "order_channel": "Unknown"})

# Convert order_amount to decimal(10,2), i.e. two decimal places.
# The intermediate cast is double rather than float: a 32-bit float keeps only
# about 7 significant digits, which alters the cents of larger amounts before
# they ever reach the decimal cast.
df = df.withColumn("order_amount", col("order_amount").cast("double").cast("decimal(10,2)"))

# Keep only ASCII letters and digits in store_code
df = df.withColumn("store_code", regexp_replace(col("store_code"), r"[^a-zA-Z0-9]", ""))

# Trim surrounding spaces from order_channel, state, and country
df = df.withColumn("order_channel", trim(col("order_channel")))
df = df.withColumn("state", trim(col("state")))
df = df.withColumn("country", trim(col("country")))

# Canonical spellings for order_channel; only exact (case-sensitive) matches
# of the keys are rewritten, every other value is left unchanged
mapping = {
    "online": "Online",
    "instore": "In-Store",
    "phone": "Phone",
}
df = df.na.replace(mapping, subset=["order_channel"])

# Attach a surrogate key column for orders
# (ids produced by monotonically_increasing_id are unique but not consecutive)
df = df.withColumn("order_sk", monotonically_increasing_id())

# Persist the cleaned data to the silver path as Delta, replacing existing data
df.write.format("delta").mode("overwrite").save(silver_path)



In [0]:
# Load the Delta data back from the Silver layer and render it for inspection
df = spark.read.load(silver_path, format="delta")
display(df)