In [0]:
"""
Process Customer Insights - Data Cleansing & Standardization
"""

#importing libraries
from pyspark.sql.functions import col, lit, trim, lower, regexp_replace, round, when
from pyspark.sql import DataFrame


In [0]:
# Source (bronze) and destination (silver) locations for the Customer Insights data.
bronze_path = "/mnt/Prajwal/Retail_sales_usecase/bronzeCInsights"
silver_path = "/mnt/Prajwal/Retail_sales_usecase/Silver/SilverCInsights"

# Columns that keep their incoming type; every other column is cast to integer.
non_integer_columns = ["customer_id", "order_frequency", "ingest_time"]

# Read the raw parquet data from the bronze layer.
df = spark.read.format("parquet").load(bronze_path)

# Keep a single row per customer_id.
# NOTE(review): dropDuplicates does not let us choose which duplicate row
# survives -- confirm that any row per customer is acceptable here.
df = df.dropDuplicates(["customer_id"])

# Cast all remaining columns to integer in ONE projection. Calling withColumn
# once per column inside a loop adds a projection per call and can produce
# very large query plans; a single select keeps the plan flat. The explicit
# alias preserves each column's original name, and iterating df.columns
# preserves the original column order.
df = df.select(
    [
        col(column)
        if column in non_integer_columns
        else col(column).cast("Integer").alias(column)
        for column in df.columns
    ]
)

# Strip leading/trailing whitespace from order_frequency.
df = df.withColumn("order_frequency", trim(col("order_frequency")))

# Persist the cleansed data to the silver layer as Delta, replacing any existing data.
df.write.format("delta").mode("overwrite").save(silver_path)

In [0]:
# Row-count check: report how many records exist in the bronze source and in
# the silver destination written above.
df_b = spark.read.load(bronze_path, format="parquet")
df_s = spark.read.load(silver_path, format="delta")

bronze_count = df_b.count()
print("Number of records in bronze table:", bronze_count)

silver_count = df_s.count()
print("Number of records in silver table:", silver_count)

In [0]:
# Load the Delta data back from the silver path and render it for inspection.
df = spark.read.load(silver_path, format="delta")
df.display()