In [0]:
# import important libraries
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

In [0]:
# Delta locations of the two dimension tables used throughout this notebook.
# They get their own names so that a path string and the DataFrame loaded from
# it never share a variable.
CUSTOMER_DIM_PATH = "/mnt/Prajwal/Capstone_Project/Gold_clone/dim_customer"
LOAN_DETAILS_DIM_PATH = "/mnt/Prajwal/Capstone_Project/Gold_clone/Dim_Loan_details"

# Load both tables as Spark DataFrames from their Delta locations.
customer_df = spark.read.format("delta").load(CUSTOMER_DIM_PATH)
loan_details_df = spark.read.format("delta").load(LOAN_DETAILS_DIM_PATH)

display(customer_df)
display(loan_details_df)


In [0]:
# Customers who took a loan for their wedding while living in a rented house.
# Both comparisons run on lower-cased values, so they are case-insensitive.
is_renting = lower(col("home_ownership")) == 'rent'
is_wedding_loan = lower(col('purpose')) == 'wedding'

filtered_loans = loan_details_df.where(is_renting & is_wedding_loan)

# Inner join: only loans with a matching customer_id in customer_df are kept.
result_df = filtered_loans.join(customer_df, on="customer_id", how="inner")

# NOTE(review): the projection to personal-detail columns is currently disabled,
# so every column from both tables is displayed. Confirm the column list before re-enabling:
#result_df = result_df.select("customer_id","Customer_Key","city","phone_no","area_code","marital_status","gender","dob","age","email")

display(result_df)

In [0]:
#Calculate Payment Gap Days (the difference in days between consecutive payments).
# lag, datediff, col and Window are already provided by the import cell at the
# top of the notebook, so they are not re-imported here.
payment_df = spark.read.format("delta").load("/mnt/Prajwal/Prajwal/Retail_sales_usecase/gold/Fact_Payemnt")

# One window per loan, ordered by payment date, so lag() yields the previous payment of the same loan.
payments_by_loan = Window.partitionBy("loan_id").orderBy(col("payment_date"))

pay_gap = (
    payment_df
    .withColumn("prev_payment_date", lag(col("payment_date")).over(payments_by_loan))
    # The first payment of each loan has no predecessor, so its gap is null.
    .withColumn("payment_gap_days", datediff(col("payment_date"), col("prev_payment_date")))
    .select("loan_id", "customer_id", "payment_amount", "payment_date", "prev_payment_date", "payment_gap_days")
)
display(pay_gap)

In [0]:
#Find Customer Profitability. Categorize customers based on loan amounts, payments, penalties, and income.
# sum / avg / col / when are the pyspark.sql.functions versions brought in by the
# import cell at the top of the notebook (sum is not the Python builtin here).

# Per-customer total loan amount and average annual income.
loan_sums = loan_details_df.groupBy("customer_id").agg(
    sum("current_loan_amount").alias("total_loan_amount"),
    avg("annual_income").alias("avg_annual_income"),
)

# Payments are aggregated per loan first, mapped to their customer through
# loan_details_df, and then rolled up per customer.
loan_payment_totals = payment_df.groupBy("loan_id").agg(
    sum("payment_amount").alias("total_payment_amount"),
    sum("penalty_amount").alias("total_penalty_amount"),
)
payment_sums = (
    loan_payment_totals
    .join(loan_details_df.select("loan_id", "customer_id"), on="loan_id")
    .groupBy("customer_id")
    .agg(
        sum("total_payment_amount").alias("total_payments"),
        sum("total_penalty_amount").alias("total_penalities"),
    )
)

# The outer join keeps customers present on only one side; their missing metrics
# become 0. The fill is limited to the metric columns so a null customer_id is
# never turned into a fabricated id 0.
metric_columns = ["total_loan_amount", "avg_annual_income", "total_payments", "total_penalities"]
profitability_df = loan_sums.join(payment_sums, on='customer_id', how='outer').fillna(0, subset=metric_columns)

# Score = payments - penalties + 10% of average income - 5% of total loan amount.
profitability_score_expr = (
    col("total_payments")
    - col("total_penalities")
    + (col("avg_annual_income") * 0.1)
    - (col("total_loan_amount") * 0.05)
)
profitability_df = profitability_df.withColumn("profitability_score", profitability_score_expr)

# A strictly positive score is "Profitable"; everything else (including 0) is "Unprofitable".
profitability_df = profitability_df.withColumn("profitability_category", when(col("profitability_score") > 0, "Profitable").otherwise("Unprofitable"))
display(profitability_df)

In [0]:
# Average annual loan payment: total paid per loan divided by the number of
# years between that loan's first and last payment dates.
# sum / min / max below are the pyspark.sql.functions aggregates made available
# by the notebook's star import, not the Python builtins.
payment_span = payment_df.groupBy("loan_id").agg(
    sum("payment_amount").alias("total_payments"),
    min("payment_date").alias("start_date"),
    max("payment_date").alias("end_date"),
)

# Span between first and last payment, in years.
raw_years_active = datediff(col("end_date"), col("start_date")) / lit(365.25)

# A span shorter than one year counts as one full year, which also avoids
# dividing by zero for a loan with a single payment date.
loan_payment_summary = payment_span.withColumn(
    "years_active",
    when(raw_years_active < 1, 1).otherwise(raw_years_active),
)

average_annual_loan_payment_df = loan_payment_summary.withColumn(
    "avg_annual_payment",
    col("total_payments") / col("years_active"),
)

display(average_annual_loan_payment_df)