In [0]:
# Importing the required libraries
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, IntegerType, DateType, StringType, DoubleType, TimestampType, BooleanType, DecimalType
from delta.tables import DeltaTable
from pyspark.sql.functions import current_timestamp, lit


In [0]:
# Storage locations used by this notebook.
# Silver layer: cleaned bank loan details (the source for this dimension).
silver_layer = "/mnt/Prajwal/Capstone_Project/silver/bankloandetails"
# Gold layer: customer dimension, used to check loan records against known customers.
gold_layer = "/mnt/Prajwal/Capstone_Project/Gold_clone/dim_customer"

# Gold layer: target location of the loan-details dimension.
gold_path = "/mnt/Prajwal/Capstone_Project/Gold_clone/Dim_Loan_details"

# Load the silver loan details and the gold customer dimension as Delta tables.
df_silver = spark.read.load(silver_layer, format="delta")
df_gold = spark.read.load(gold_layer, format="delta")

In [0]:
# Identify orphan loan records: rows of the silver loan details whose customer_id
# has no entry in the customer gold table.
#
# A left_anti join returns only the left-side rows that have NO match on the right.
# The previous left join + `isNotNull()` filter selected the customers that DO exist
# in gold (the opposite of "missing"), and could also multiply rows when the
# dimension holds more than one row per customer.
missing_records = df_silver.join(df_gold, on="customer_id", how="left_anti").select("customer_id")

display(missing_records)
print("Total records in Loan details :", df_silver.count())
print("Total missing records in Customer Gold table but present in Loan details:", missing_records.count())

In [0]:
# Collapse the missing-record list to one row per customer_id.
missing_records = missing_records.drop_duplicates(["customer_id"])

display(missing_records)

In [0]:
# Keep only the loan records whose customer_id has a match in the customer gold table.
# A left_semi join returns the silver-side columns only, for rows with at least one match.
customer_match = df_silver.customer_id == df_gold.customer_id
valid_customers_df = df_silver.join(df_gold, on=customer_match, how="left_semi")

display(valid_customers_df)
# Report how many loan records survived the customer check.
print("Total valid records in Loan Details which has matching entry in Customer Gold table:", valid_customers_df.count())


In [0]:
# Reduce the valid loan records to a single row per customer_id.
# NOTE(review): which row survives for a customer with several loans is not
# controlled here — confirm that keeping an arbitrary one is intended.
valid_customers_df = valid_customers_df.drop_duplicates(["customer_id"])

display(valid_customers_df)

In [0]:
from pyspark.sql.functions import lit, col, coalesce, max

# Placeholder surrogate-key column; real values are assigned in the next step.
# The explicit cast gives the column an integer type instead of NullType.
valid_customers_df = valid_customers_df.withColumn("loan_detail_key", lit(None).cast(IntegerType()))

# Continue the surrogate-key sequence from the highest key already stored in the
# gold loan-details dimension. Taking max() over the placeholder column above
# always yields 0, which would restart the keys at 1 on every run and collide
# with keys that already exist in the target table.
if DeltaTable.isDeltaTable(spark, gold_path):
    max_key = (
        spark.read.format("delta").load(gold_path)
        .agg(coalesce(max("loan_detail_key"), lit(0)))
        .collect()[0][0]
    )
else:
    # First run: the dimension has not been created yet, so start from 0.
    max_key = 0

display(valid_customers_df)
display(max_key)

In [0]:
# Assign surrogate keys: number the rows in (Customer_Id, Loan_Id) order and
# offset the numbering by max_key.
# The window has no partitionBy, so the numbering is global across the whole DataFrame.
window_spec = Window.orderBy("Customer_Id","Loan_Id")

numbered_df = valid_customers_df.withColumn("rn", row_number().over(window_spec))
keyed_df = numbered_df.withColumn("loan_detail_key", col("rn") + lit(max_key))
# "rn" is only a helper column; remove it once the key is derived.
valid_customers_df = keyed_df.drop("rn")

display(valid_customers_df)

In [0]:
# Schema of the loan-details dimension. Every field is nullable.
# Listed as (column name, data type) pairs in the target column order.
loan_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("loan_detail_key", IntegerType()),
        ("loan_id", StringType()),
        ("customer_id", StringType()),
        ("current_loan_amount", DecimalType(10,2)),
        ("term", StringType()),
        ("credit_score", StringType()),
        ("credit_category", StringType()),
        ("annual_income", DecimalType(10,2)),
        ("years_in_current_job", StringType()),
        ("home_ownership", StringType()),
        ("purpose", StringType()),
        ("loan_start_date", DateType()),
        ("start_date", TimestampType()),
        ("end_date", TimestampType()),
        ("is_active", BooleanType()),
    ]
])

In [0]:
full_table_name = "Prajwal.loan_details_dim"

# Create the gold loan-details dimension only when it does not exist yet.
# The previous version loaded from a misspelled variable (`gold_pathh`) inside a
# bare `except:`; the resulting NameError was swallowed and the gold table was
# overwritten with an empty DataFrame on every run.
if not DeltaTable.isDeltaTable(spark, gold_path):
    empty_df = spark.createDataFrame([], loan_schema)
    empty_df.write.format("delta").mode("overwrite").save(gold_path)

loan_details_dim = spark.read.format("delta").load(gold_path)

# Register the location in the metastore; IF NOT EXISTS makes this safe to re-run.
spark.sql(f"CREATE TABLE IF NOT EXISTS {full_table_name} USING DELTA LOCATION '{gold_path}'")

In [0]:
# SCD Type 2 load of the loan-details dimension.
#
# For each incoming customer row:
#   * no active row in the target              -> insert it as the active version
#   * active row exists, attributes unchanged  -> leave the target untouched
#   * active row exists, attributes changed    -> expire the old version AND
#                                                 insert the new one in the same merge
#
# Fixes over the previous version:
#   * the merge was invoked on an undefined name (`loan_details_dim_delta`);
#   * every matched active row was expired even when nothing had changed, and the
#     replacement version was never inserted for matched customers.
loan_details_dim = DeltaTable.forPath(spark, gold_path)

# Attributes whose change produces a new version of the row.
tracked_columns = [
    "loan_id",
    "current_loan_amount",
    "term",
    "credit_score",
    "credit_category",
    "annual_income",
    "years_in_current_job",
    "home_ownership",
    "purpose",
    "loan_start_date",
]
# `<=>` is the null-safe equality operator, so NULL vs NULL is not reported as a change.
change_condition = " OR ".join(f"NOT (tgt.{c} <=> src.{c})" for c in tracked_columns)

# Incoming rows whose customer already has an active version with different attributes.
active_rows_df = loan_details_dim.toDF().filter("is_active = true")
changed_rows_df = (
    valid_customers_df.alias("src")
    .join(active_rows_df.alias("tgt"), F.expr("src.customer_id = tgt.customer_id"), "inner")
    .filter(F.expr(change_condition))
    .select("src.*")
)

# Staged source for the merge:
#   * merge_key = NULL        -> never matches, so the changed row is inserted as the new version
#   * merge_key = customer_id -> matches the active row (to expire it) or inserts a new customer
staged_updates_df = (
    changed_rows_df.withColumn("merge_key", F.lit(None).cast("string"))
    .unionByName(
        valid_customers_df.withColumn("merge_key", F.col("customer_id").cast("string"))
    )
)

merge_condition = "tgt.customer_id = src.merge_key AND tgt.is_active = true"

# Expire the currently active version.
update_action = {
    "end_date": current_timestamp(),
    "is_active": lit(False)
}

# Insert a new active version with an open-ended validity period.
insert_action = {
    "loan_detail_key": "src.loan_detail_key",
    "loan_id": "src.loan_id",
    "customer_id": "src.customer_id",
    "current_loan_amount": "src.current_loan_amount",
    "term": "src.term",
    "credit_score": "src.credit_score",
    "credit_category": "src.credit_category",
    "annual_income": "src.annual_income",
    "years_in_current_job": "src.years_in_current_job",
    "home_ownership": "src.home_ownership",
    "purpose": "src.purpose",
    "loan_start_date": "src.loan_start_date",
    "start_date": current_timestamp(),
    "end_date": lit(None).cast(TimestampType()),
    "is_active": lit(True)
}

loan_details_dim.alias("tgt").merge(
    staged_updates_df.alias("src"),
    merge_condition
).whenMatchedUpdate(condition=change_condition, set=update_action) \
 .whenNotMatchedInsert(values=insert_action) \
 .execute()





In [0]:
# Show the current contents of the registered loan-details dimension table.
loan_details_dim_rows = spark.sql("select * from prajwal.loan_details_dim")
loan_details_dim_rows.display()