In [0]:
# Importing the required libraries
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
from pyspark.sql import functions as F


In [0]:
# Storage locations: loan details live in the silver layer, the customer
# dimension lives in the gold layer.
silver_layer = "/mnt/Prajwal/Capstone_Project/silver/bankloandetails"
gold_layer = "/mnt/Prajwal/Capstone_Project/gold/dim_customer"

# Read both Delta tables; they are compared in the cells below to find loans
# whose customer is (or is not) present in the customer dimension.
df_silver = spark.read.load(silver_layer, format="delta")
df_gold = spark.read.load(gold_layer, format="delta")



In [0]:

# Loans whose customer_id has NO matching row in the customer Gold table.
# A left_anti join keeps only the left-side rows without a match. The previous
# left join + `isNotNull()` filter kept the *matched* rows instead, so the
# "missing" count was actually the count of valid records.
missing_records = df_silver.join(df_gold, on="customer_id", how="left_anti").select("customer_id")

print("Total records in Loan details :", df_silver.count())
print("Total missing records in Customer Gold table but present in Loan details:", missing_records.count())

In [0]:
# Keep only the loan rows whose customer_id also exists in the customer Gold
# table. A left_semi join returns columns from df_silver only and never
# duplicates a loan row, even if the customer appears more than once in Gold.
customer_match = df_silver["customer_id"] == df_gold["customer_id"]
valid_customers_df = df_silver.join(df_gold, customer_match, "left_semi")

# Report how many loan rows survived the customer check
valid_count = valid_customers_df.count()
print("Total valid records in Loan Details which has matching entry in Customer Gold table:", valid_count)


In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, IntegerType, DateType, StringType, DoubleType, TimestampType, BooleanType, DecimalType
from delta.tables import DeltaTable
from pyspark.sql.functions import current_timestamp, lit

# Schema of the loan dimension table. The first twelve columns are loan
# attributes; start_date / end_date / is_active are the columns the merge
# below uses to version rows. Every column is nullable.
loan_schema = (
    StructType()
    .add("loan_id", StringType(), True)
    .add("customer_id", StringType(), True)
    .add("current_loan_amount", DecimalType(10, 2), True)
    .add("term", StringType(), True)
    .add("credit_score", StringType(), True)
    .add("credit_category", StringType(), True)
    .add("annual_income", DecimalType(10, 2), True)
    .add("years_in_current_job", StringType(), True)
    .add("home_ownership", StringType(), True)
    .add("purpose", StringType(), True)
    .add("loan_start_date", DateType(), True)
    .add("loan_start_date_flag", StringType(), True)
    .add("start_date", TimestampType(), True)
    .add("end_date", TimestampType(), True)
    .add("is_active", BooleanType(), True)
)

gold_path = "/mnt/Prajwal/Capstone_Project/gold/dim_loan_details"

# Create the Gold table only when no Delta table exists at gold_path.
# The previous bare `except:` treated *any* read failure (permissions, transient
# storage errors, ...) as "table missing" and then overwrote the path with an
# empty DataFrame, which could wipe an existing dimension table.
if not DeltaTable.isDeltaTable(spark, gold_path):
    # "errorifexists" instead of "overwrite": never clobber a table that appeared
    # between the check above and this write.
    (
        spark.createDataFrame([], loan_schema)
        .write.format("delta")
        .mode("errorifexists")
        .save(gold_path)
    )

# Register the table in the metastore on every run. IF NOT EXISTS makes this a
# no-op when it is already registered, and it also covers the case where the
# Delta files exist but the metastore entry does not.
spark.sql(f"CREATE TABLE IF NOT EXISTS banking.Dim_Loan_Details USING DELTA LOCATION '{gold_path}'")

In [0]:
# Load the Gold dimension table (the original cell loaded it twice).
dim_table = DeltaTable.forPath(spark, gold_path)

# SCD Type 2 upsert.
#
# The previous merge expired every matched active row unconditionally and never
# inserted a replacement version, so re-running the notebook with unchanged data
# left the dimension with no active rows. This version:
#   * expires an active row only when a tracked attribute actually changed, and
#   * inserts the new version of a changed loan in the same merge by staging it
#     with a NULL merge key (a NULL key never matches, so it goes to the insert branch).

# Attributes whose change should produce a new version of the loan row.
tracked_columns = [
    "customer_id",
    "current_loan_amount",
    "term",
    "credit_score",
    "credit_category",
    "annual_income",
    "years_in_current_job",
    "home_ownership",
    "purpose",
    "loan_start_date",
    "loan_start_date_flag",
]

# `<=>` is null-safe equality, so NULL vs NULL is not reported as a change.
row_changed = " OR ".join(f"NOT (dim.{name} <=> src.{name})" for name in tracked_columns)

# Source rows whose currently active dimension row differs: these need a new version.
active_dim = dim_table.toDF().where("is_active = true").alias("dim")
changed_loans = (
    valid_customers_df.alias("src")
    .join(active_dim, F.expr("src.loan_id = dim.loan_id"), "inner")
    .where(F.expr(row_changed))
    .select("src.*")
)

# Staged source:
#   _merge_key = NULL    -> new version of a changed loan (always inserted)
#   _merge_key = loan_id -> may match the active row (expire it if changed),
#                           or is a brand-new loan (inserted)
loan_id_type = valid_customers_df.schema["loan_id"].dataType
staged_updates = (
    changed_loans.withColumn("_merge_key", F.lit(None).cast(loan_id_type))
    .unionByName(valid_customers_df.withColumn("_merge_key", F.col("loan_id")))
)

# NOTE(review): like the original merge, this assumes at most one source row per
# loan_id — confirm against the silver table.
(
    dim_table.alias("dim")
    .merge(
        staged_updates.alias("src"),
        "dim.loan_id = src._merge_key AND dim.is_active = true",
    )
    .whenMatchedUpdate(
        condition=row_changed,
        set={
            "end_date": current_timestamp(),
            "is_active": lit(False),
        },
    )
    .whenNotMatchedInsert(values={
        "loan_id": "src.loan_id",
        "customer_id": "src.customer_id",
        "current_loan_amount": "src.current_loan_amount",
        "term": "src.term",
        "credit_score": "src.credit_score",
        "credit_category": "src.credit_category",
        "annual_income": "src.annual_income",
        "years_in_current_job": "src.years_in_current_job",
        "home_ownership": "src.home_ownership",
        "purpose": "src.purpose",
        "loan_start_date": "src.loan_start_date",
        "loan_start_date_flag": "src.loan_start_date_flag",
        "start_date": current_timestamp(),
        "end_date": lit(None).cast(TimestampType()),
        "is_active": lit(True),
    })
    .execute()
)

In [0]:
%sql
-- Inspect the loan dimension after the merge. The table is registered earlier in
-- this notebook as banking.Dim_Loan_Details; the previous query selected from
-- banking.loan_details_dim, which this notebook never creates.
select * from banking.Dim_Loan_Details