In [0]:
"""
The silver zone is responsible for cleaning and standardizing the data and for applying business rules.
Auto Optimize & Small File Compaction (Delta Optimization)
Memory & Shuffle Optimizations (if transformations occur)
Data Deduplication (Removing Duplicates)
Data Type Optimization (Ensuring efficient storage & performance)

"""

In [0]:
%python
# Importing necessary libraries
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
from pyspark.sql.functions import substring
from pyspark.sql.functions import trim

# Define paths for bronze and silver zones
# NOTE(review): bronze_path is not used in this cell; the data is read directly from
# the Source_Files CSV below. Confirm whether the read should come from bronze instead.
bronze_path = "/mnt/Prajwal/Capstone_Project/bronze/Loanpayments"
silver_path = "/mnt/Prajwal/Capstone_Project/silver/Loanpayments"

# Explicit read schema. Every business column is read as a string so that it can be
# trimmed before being cast; ingest_time is the only non-string column.
schema = StructType([
    StructField("Loan_ID", StringType(), True),
    StructField("Payment_ID", StringType(), True),
    StructField("Payment_Amount", StringType(), True),
    StructField("Payment_Date", StringType(), True),
    StructField("Payment_Status", StringType(), True),
    StructField("Balance_Amount", StringType(), True),
    StructField("Penalty_Amount", StringType(), True),
    StructField("ingest_time", TimestampType(), True)
])

# The file is read without a header row, using the schema above.
raw_df = spark.read.format("csv").schema(schema).option("header", "False").load("/mnt/Prajwal/Capstone_Project/Source_Files/Loanpayments.csv")

# Trim only the string columns. trim() implicitly casts its input to string, so
# applying it to every column would silently turn ingest_time into a string.
trimmed_columns = [
    trim(col(field.name)).alias(field.name)
    if isinstance(field.dataType, StringType)
    else col(field.name)
    for field in schema.fields
]

# Payment_Date arrives in two formats; take the first pattern that parses.
# Values matching neither pattern become NULL.
parsed_payment_date = coalesce(
    to_date(col("Payment_Date"), "M/d/yyyy"),
    to_date(col("Payment_Date"), "d-MMM-yy"),
)

df = (
    raw_df
    # Every row is stamped with the same fixed ingestion timestamp.
    .withColumn("ingest_time", lit("2025-05-02 09:56:01").cast(TimestampType()))
    .select(trimmed_columns)
    # Monetary columns are stored as doubles; unparseable values become NULL
    # and are removed by dropna() below.
    .withColumn("Payment_Amount", col("Payment_Amount").cast(DoubleType()))
    .withColumn("Balance_Amount", col("Balance_Amount").cast(DoubleType()))
    .withColumn("Penalty_Amount", col("Penalty_Amount").cast(DoubleType()))
    # De-duplicate on Payment_ID rather than Loan_ID: keying on Loan_ID keeps one
    # arbitrary row per loan and discards every other payment made against it.
    # NOTE(review): assumes Payment_ID uniquely identifies a payment - confirm.
    .dropDuplicates(["Payment_ID"])
    # Drop rows that still contain a NULL in any column at this point.
    .dropna()
    .withColumn("Payment_Date", parsed_payment_date)
    # Values are already trimmed above, so only the case needs normalising.
    .withColumn("Payment_Status", upper(col("Payment_Status")))
)

# NOTE: ingest_time is now written as a timestamp. If the Delta table at silver_path
# was previously written with ingest_time as a string, a plain overwrite is rejected
# with a schema mismatch; write with the overwriteSchema option (or drop the table).




In [0]:
# Full-refresh write of the cleaned payments to the silver Delta table.
# overwriteSchema lets the overwrite replace the stored schema as well as the data,
# so a change in column types upstream does not fail the write with a
# schema-mismatch error against the table already on disk.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(silver_path)
)

In [0]:
# Load the silver Delta table back from storage and render it to check the write
silver_df = spark.read.load(silver_path, format="delta")
display(silver_df)

In [0]:
from pyspark.sql.functions import col, current_date, lit
from delta.tables import DeltaTable

# Column that identifies a record across versions. The silver table has no `id`
# column, so the merge is keyed on Payment_ID.
# NOTE(review): assumes Payment_ID is the business key - confirm.
SCD_KEY = "Payment_ID"

# Attributes whose change should open a new version of a record.
# NOTE(review): assumes the incoming data carries the same columns as the silver table.
TRACKED_COLUMNS = [
    "Loan_ID",
    "Payment_Amount",
    "Payment_Date",
    "Payment_Status",
    "Balance_Amount",
    "Penalty_Amount",
]


def _any_tracked_column_differs(target_alias, source_alias):
    """Return a SQL predicate that is true when any tracked column differs.

    Uses the null-safe equality operator so that NULL vs. value counts as a change
    and NULL vs. NULL does not.
    """
    return " OR ".join(
        f"NOT ({target_alias}.{name} <=> {source_alias}.{name})"
        for name in TRACKED_COLUMNS
    )


# Define the new data to be merged
new_data_df = ...  # TODO: assign the incoming DataFrame (one row per Payment_ID)
if new_data_df is ...:
    # Fail with a clear message instead of an AttributeError on the next line.
    raise NotImplementedError(
        "new_data_df is still a placeholder; assign the incoming DataFrame before running this cell."
    )

# Add effective date columns to the new data. end_date is cast to DATE because a
# bare lit(None) produces an untyped (NullType) column.
new_data_df = (
    new_data_df
    .withColumn("effective_date", current_date())
    .withColumn("end_date", lit(None).cast("date"))
)

# Make sure the silver table has the SCD bookkeeping columns; the merge condition
# below references end_date and cannot be resolved without it. Existing rows get
# NULL for both columns, i.e. they are treated as the current versions.
delta_table = DeltaTable.forPath(spark, silver_path)
existing_columns = {name.lower() for name in delta_table.toDF().columns}
missing_columns = [
    f"{name} DATE"
    for name in ("effective_date", "end_date")
    if name not in existing_columns
]
if missing_columns:
    spark.sql(f"ALTER TABLE delta.`{silver_path}` ADD COLUMNS ({', '.join(missing_columns)})")
    # Re-open the table so the handle reflects the new schema.
    delta_table = DeltaTable.forPath(spark, silver_path)

# Read the existing data from the silver zone
silver_df = spark.read.format("delta").load(silver_path)
current_rows = silver_df.where(col("end_date").isNull())

# Incoming rows whose key already has a current version with different attributes.
# They are staged a second time with a NULL merge key so that they never match and
# are therefore inserted as the new current version, while their keyed copy
# end-dates the old version.
changed_rows = (
    new_data_df.alias("new")
    .join(current_rows.alias("cur"), col(f"new.{SCD_KEY}") == col(f"cur.{SCD_KEY}"))
    .where(_any_tracked_column_differs("cur", "new"))
    .select("new.*")
)
staged_df = (
    changed_rows.withColumn("merge_key", lit(None).cast("string"))
    .unionByName(new_data_df.withColumn("merge_key", col(SCD_KEY).cast("string")))
)

# Define the merge condition: match only against the current version of each key
merge_condition = f"silver_df.{SCD_KEY} = new_data_df.merge_key AND silver_df.end_date IS NULL"

# Insert only the table's own columns (merge_key is a staging helper)
insert_values = {name: f"new_data_df.{name}" for name in delta_table.toDF().columns}

# Perform the merge:
#   matched and changed -> close the current version
#   matched, unchanged  -> left untouched
#   not matched         -> insert (brand-new keys and new versions of changed keys)
delta_table.alias("silver_df").merge(
    staged_df.alias("new_data_df"),
    merge_condition
).whenMatchedUpdate(
    condition=_any_tracked_column_differs("silver_df", "new_data_df"),
    set={"end_date": current_date()}
).whenNotMatchedInsert(
    values=insert_values
).execute()

# Display the updated silver zone data
updated_silver_df = spark.read.format("delta").load(silver_path)
display(updated_silver_df)