In [0]:
# Import standard-library modules and the PySpark types/functions used in this notebook
import os
from datetime import datetime
import time
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, TimestampType, IntegerType
import pyspark.sql.functions as F
from pyspark.sql.functions import *

In [0]:
%python
# Bronze -> Silver cleaning for the bank loan details dataset.
# Reads the Bronze Parquet data, normalises column names, values and types,
# derives credit_category / loan_start_date_flag, and writes the result to the
# Silver zone as a Delta table.

# Define paths for bronze and silver zones
bronze_path = "/mnt/Prajwal/Capstone_Project/bronze/bankloandetails"
silver_path = "/mnt/Prajwal/Capstone_Project/silver/bankloandetails"

# Loan amounts below this threshold are treated as invalid and replaced with 0.
MIN_VALID_LOAN_AMOUNT = 900

# Bronze column name -> Silver column name.
# NOTE(review): "loan_start_dt  " carries two trailing spaces. withColumnRenamed
# is a no-op when the source column does not exist, so confirm this key matches
# the Bronze schema exactly; otherwise the loan_start_date steps below will fail.
COLUMN_RENAMES = {
    "Loan ID": "loan_id",
    "customer ID": "customer_id",
    "Current Loan Amount": "current_loan_amount",
    "Credit Score": "credit_score",
    "annual income": "annual_income",
    "years in current job": "years_in_current_job",
    "home ownership": "home_ownership",
    "loan_start_dt  ": "loan_start_date",
}

# Date patterns tried in order for loan_start_date; the first non-null parse wins.
LOAN_START_DATE_FORMATS = [
    "d-MMM-yy",
    "M/d/yyyy",
    "MMM dd-yyyy",
    "yyyy-MM-dd",
    "MMMM dd, yyyy",
    "MM/dd/yyyy",
    "MMM-dd/yyyy",
    "MMM-d/yyyy",
]

df_bronze = spark.read.parquet(bronze_path)

# Rename columns
for raw_name, clean_name in COLUMN_RENAMES.items():
    df_bronze = df_bronze.withColumnRenamed(raw_name, clean_name)

# Fill missing values
df_bronze = df_bronze.fillna({
    "credit_score": 0,
    "annual_income": 0,
    "home_ownership": "Unknown",
    "purpose": "Unknown"
})

# Reusable column expressions (resolved lazily when passed to withColumn).
loan_amount = F.col("current_loan_amount")
annual_income = F.col("annual_income")
credit_score = F.col("credit_score")
years_in_job = F.col("years_in_current_job")
home_ownership_norm = F.lower(F.trim(F.col("home_ownership")))
loan_start_parsed = F.coalesce(
    *[F.to_date(F.col("loan_start_date"), fmt) for fmt in LOAN_START_DATE_FORMATS]
)

# The variable name df_bronze is kept for the transformed frame so that any
# other cell referring to it keeps working.
df_bronze = (
    df_bronze
    # Remove negative sign from current_loan_amount
    .withColumn("current_loan_amount", F.regexp_replace(loan_amount, "^-", ""))
    # Cast columns to appropriate data types
    .withColumn("current_loan_amount", loan_amount.cast("decimal(10,2)"))
    .withColumn("annual_income", annual_income.cast("decimal(10,2)"))
    # Handle invalid values in current_loan_amount and annual_income
    .withColumn(
        "current_loan_amount",
        F.when(loan_amount < MIN_VALID_LOAN_AMOUNT, 0).otherwise(loan_amount),
    )
    .withColumn("annual_income", F.when(annual_income < 0, 0).otherwise(annual_income))
    # Standardize home_ownership column.
    # "unknown" contains the substring "own", so it must be matched explicitly
    # BEFORE the "own" check; otherwise every missing value (filled with
    # "Unknown" above) would be relabelled as "Own Home".
    .withColumn(
        "home_ownership",
        F.when(home_ownership_norm.contains("mortgage"), F.lit("Home Mortgage"))
        .when(home_ownership_norm.contains("rent"), F.lit("Rent"))
        .when(home_ownership_norm == "unknown", F.lit("Unknown"))
        .when(home_ownership_norm.contains("own"), F.lit("Own Home"))
        .otherwise(F.lit("Unknown")),
    )
    # Handle 'N/A' in years_in_current_job
    .withColumn(
        "years_in_current_job",
        F.when(F.lower(years_in_job) == "n/a", 0).otherwise(years_in_job),
    )
    # Create credit_category column.
    # A score of 0 is the placeholder written by fillna above for a missing
    # value, so non-positive scores get a null category instead of "Poor".
    .withColumn(
        "credit_category",
        F.when(credit_score <= 0, F.lit(None))
        .when(credit_score < 580, "Poor")
        .when((credit_score >= 580) & (credit_score <= 669), "Fair")
        .when((credit_score >= 670) & (credit_score <= 739), "Good")
        .when((credit_score >= 740) & (credit_score <= 799), "Very Good")
        .when(credit_score >= 800, "Exceptional")
        .otherwise(None),
    )
    # Parse loan_start_date with multiple date formats
    .withColumn("loan_start_date", loan_start_parsed)
    # Flag rows whose loan_start_date is null after parsing
    .withColumn(
        "loan_start_date_flag",
        F.when(F.col("loan_start_date").isNull(), F.lit("missing")).otherwise(F.lit("present")),
    )
)

# Write to Silver layer Delta format.
# The target directory is intentionally NOT deleted first: a Delta overwrite is
# a single atomic commit, whereas removing the directory beforehand discards
# the table history and leaves Silver empty if the write fails part-way.
# overwriteSchema lets the overwrite replace an existing table whose schema differs.
(
    df_bronze.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(silver_path)
)

In [0]:
# Load the Silver-layer Delta table back from storage and preview it
df_silver = spark.read.load(silver_path, format="delta")

display(df_silver)
