In [0]:
"""
# Referential Integrity Check Before the Fact_Payment Merge
# Before merging into the fact table, check that every loan_id in the silver Loanpayments data exists in prajwal.loan_details_dim.
# Identify orphaned loan_id values in the payments data.
# Any records returned by this check have a loan_id that exists in the payments data but not in the loan-details dimension.
# You can reject, log, or hold these records until their dimension rows arrive.

"""

In [0]:
# Read the silver-layer loan payments Delta table and preview it
silver_path = "/mnt/Prajwal/Capstone_Project/silver/Loanpayments"
silver_df = (
    spark.read
    .format("delta")
    .load(silver_path)
)
display(silver_df)

# Show the tables registered in the `prajwal` schema
tables = spark.sql("SHOW TABLES IN prajwal")
display(tables)


In [0]:
# Referential-integrity check: find payment rows whose loan_id has no
# matching row in the loan-details dimension (orphaned records).
dim_loan_detail_gold = spark.table("prajwal.loan_details_dim")

# A left_anti join keeps only the left-side rows that have NO match on the
# right, which is exactly the set of orphans. The earlier left join followed by
# an `isNotNull()` filter on the dimension key kept the matched rows instead.
# Rows whose loan_id is NULL never match and are therefore reported as orphans.
orphaned_records = silver_df.join(
    dim_loan_detail_gold,
    on="loan_id",
    how="left_anti",
).select("loan_id")

# Number of distinct orphaned loan_id values; 0 means every payment has a dimension row
display(orphaned_records.distinct().count())

In [0]:
# Import the required libraries
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
from delta.tables import DeltaTable
from pyspark import StorageLevel

In [0]:
# Load the three silver-layer Delta tables used in the rest of the notebook
customer_df = spark.read.format("delta").load(
    "/mnt/Prajwal/Capstone_Project/silver/bankcustomer_source"
)
loan_details_df = spark.read.format("delta").load(
    "/mnt/Prajwal/Capstone_Project/silver/bankloandetails"
)
loan_pay_df = spark.read.format("delta").load(
    "/mnt/Prajwal/Capstone_Project/silver/Loanpayments"
)

# Show the row count of each source, in load order
for source_df in (customer_df, loan_details_df, loan_pay_df):
    display(source_df.count())

In [0]:
# Preview the rows of the silver loan payments frame
display(loan_pay_df)

In [0]:
# Preview the rows of the silver loan details frame
display(loan_details_df)

In [0]:
# Print column names and types of both inputs to the fact-table join
for frame in (loan_pay_df, loan_details_df):
    frame.printSchema()

In [0]:
# Attach loan attributes to every payment row. The left join keeps all
# payments; loan columns are null where no loan_details row shares the loan_id.
Fact_table = loan_pay_df.join(loan_details_df, "loan_id", "left")

display(Fact_table)

In [0]:
from pyspark.sql import functions as F

# Names of the per-loan count columns this cell adds to Fact_table
payment_count_columns = ["late_payment_count", "missed_payment_count"]

# Remove count columns left over from a previous run of this cell so that
# re-running it does not produce duplicate, ambiguous columns.
# (DataFrame.drop is a no-op for names that are not present.)
Fact_table = Fact_table.drop(*payment_count_columns)

# Number of LATE payments per loan
late_payment_counts = (
    Fact_table
    .filter(F.col("payment_status") == "LATE")
    .groupBy("loan_id")
    .agg(F.count("*").alias("late_payment_count"))
)

# Number of MISSED payments per loan
missed_payment_counts = (
    Fact_table
    .filter(F.col("payment_status") == "MISSED")
    .groupBy("loan_id")
    .agg(F.count("*").alias("missed_payment_count"))
)

# Attach both counts to every payment row. A loan with no LATE / MISSED
# payments has no row in the corresponding aggregate, so the left join yields
# NULL for it; report those as 0 rather than as an unknown value.
Fact_table = (
    Fact_table
    .join(late_payment_counts, on="loan_id", how="left")
    .join(missed_payment_counts, on="loan_id", how="left")
    .fillna(0, subset=payment_count_columns)
)

display(Fact_table)

In [0]:
from pyspark.sql.functions import col, when, lit

# Derive Default_Risk_Category from the missed / late payment counts and the
# customer's credit score. Rules are evaluated in order; the first match wins.
missed_count = col("missed_payment_count")
late_count = col("late_payment_count")
score = col("credit_score")

# High Risk: more than 2 missed or more than 3 late payments, with score below 600
high_risk_rule = ((missed_count > 2) | (late_count > 3)) & (score < 600)

# Medium Risk: more than 1 missed or more than 2 late payments, with score in [600, 650]
medium_risk_rule = ((missed_count > 1) | (late_count > 2)) & (score.between(600, 650))

# NOTE(review): a row with score < 600 that fails the High Risk count
# thresholds (e.g. exactly 2 missed payments) matches neither rule and is
# labelled "Low Risk" — confirm this is the intended business rule.
risk_category = (
    when(high_risk_rule, lit("High Risk"))
    .when(medium_risk_rule, lit("Medium Risk"))
    .otherwise(lit("Low Risk"))
)

Fact_table = Fact_table.withColumn("Default_Risk_Category", risk_category)

display(Fact_table)

In [0]:
from pyspark.sql.functions import datediff, col, floor, months_between, to_date

# Loan tenure at the time of each payment: days elapsed between
# loan_start_date and Payment_Date.
Fact_table = Fact_table.withColumn(
    "loan_tenure_days",
    datediff(col("Payment_Date"), col("loan_start_date"))
)

# Split the tenure into whole years plus the remaining whole months, using
# calendar months. The earlier floor(days / 365) and floor((days % 365) / 30)
# arithmetic could report a month component of 12 (day remainders 360-364) and
# drifted from the calendar because of leap years and 28/31-day months.
# to_date() mirrors datediff(), which also compares the values as dates.
tenure_total_months = floor(
    months_between(to_date(col("Payment_Date")), to_date(col("loan_start_date")))
)

Fact_table = Fact_table.withColumn(
    "loan_tenure_years",
    floor(tenure_total_months / 12)
).withColumn(
    # Remainder after removing whole years
    "loan_tenure_months",
    tenure_total_months % 12
)

display(Fact_table)

In [0]:
# Columns of the payment fact projection: payment keys and amounts plus the
# derived count, risk and tenure fields.
fact_columns = [
    "Payment_ID",
    "Loan_ID",
    "customer_id",
    "Payment_Amount",
    "Payment_Date",
    "Payment_Status",
    "Balance_Amount",
    "Penalty_Amount",
    "late_payment_count",
    "missed_payment_count",
    "Default_Risk_Category",
    "loan_tenure_days",
    "loan_tenure_years",
    "loan_tenure_months",
]

# Mark the projection for caching.
# NOTE(review): the DataFrame returned by select(...).cache() is not assigned
# to any name, and the merge cell works on `Fact_table` itself — confirm
# whether this projection was meant to replace Fact_table before the write.
Fact_table.select(*fact_columns).cache()

In [0]:
%python
from delta.tables import DeltaTable

gold_path = "/mnt/Prajwal/Capstone_Project/Gold_clone/Fact_Payemnt"

# Remove duplicates from the source DataFrame
Fact_table = Fact_table.dropDuplicates(["payment_id"]).drop("ingest_time")

if DeltaTable.isDeltaTable(spark, gold_path):
    delta_table = DeltaTable.forPath(spark, gold_path)

    delta_table.alias("target").merge(
        Fact_table.alias("source"),
        "target.payment_id = source.payment_id"
    ).whenMatchedUpdateAll() \
     .whenNotMatchedInsertAll() \
     .execute()
else:
    Fact_table.write.format("delta") \
        .mode("overwrite") \
        .partitionBy("customer_id") \
        .save(gold_path)

In [0]:
# Register the Delta data stored at gold_path as table prajwal.Fact_Payment;
# IF NOT EXISTS leaves an already-registered table untouched.
create_fact_payment_sql = f"""
    CREATE TABLE IF NOT EXISTS prajwal.Fact_Payment
    USING DELTA
    LOCATION '{gold_path}'
"""
spark.sql(create_fact_payment_sql)

In [0]:
%sql
-- Preview the rows of the registered fact table
select * from prajwal.Fact_Payment