# Read Silver tables

In [0]:
# Load the Silver-layer train and test splits as Spark DataFrames.
train_df, test_df = (
    spark.table("credit_catalog.silver.train"),
    spark.table("credit_catalog.silver.test"),
)

## Combining Training and Test Datasets for Analysis

In [0]:
from pyspark.sql.functions import lit

# Step 0: Tag every row with its originating split so the two datasets can
# still be told apart after they are stacked into one DataFrame.
train_df = train_df.withColumn("data_split", lit("train"))
test_df = test_df.withColumn("data_split", lit("test"))

# Step 1: The test split carries no label, so add Credit_Score as an all-NULL
# column. The NULL literal is cast to the training column's data type because
# a bare lit(None) produces an untyped (NullType) column, which would leave
# the schema of the union dependent on implicit type coercion.
test_df = test_df.withColumn(
    "Credit_Score",
    lit(None).cast(train_df.schema["Credit_Score"].dataType),
)

# Step 2: Project the test split onto exactly the training columns, in the
# training order. This raises if the test split lacks a training column and
# drops any column that exists only in the test split.
test_df = test_df.select(train_df.columns)

# Step 3: Stack the train rows and the test rows, matching columns by name.
full_df = train_df.unionByName(test_df)

# Step 4: Register the combined data as the temp view "credit_data" so the
# aggregations can be written in SQL.
full_df.createOrReplaceTempView("credit_data")


### Analysis of Professions with the Highest Average Loan Amount and Earnings

In [0]:

# Per-occupation financial profile, computed over the "credit_data" temp view.
# For each Occupation the query averages salary, EMI, monthly balance, monthly
# investment and the three derived ratio columns, rounding each to 2 decimals.
# The card and loan counts are rounded with no scale argument (presumably to
# whole numbers -- confirm that this is intended).
# There is no Credit_Score filter here, so rows from both the train and the
# test split contribute to the averages.
# NOTE(review): the ORDER BY sorts this DataFrame (highest average EMI first,
# ties broken by highest average salary); row order is presumably not
# guaranteed once the result is saved as a table -- confirm before relying on it.
query = """
    SELECT
        Occupation,
        ROUND(AVG(Monthly_Inhand_Salary),2) AS avg_salary,
        ROUND(AVG(Total_EMI_per_month),2)   AS avg_emi,
        ROUND(AVG(Num_Credit_Card))       AS avg_num_cards,
        ROUND(AVG(Num_of_Loan))           AS avg_num_loans,
        ROUND(AVG(Monthly_Balance),2)       AS avg_balance,
        ROUND(AVG(Amount_invested_monthly),2) AS avg_invested,
        ROUND(AVG(Debt_to_Income_Ratio),2)    AS avg_debt_to_income_ratio,
        ROUND(AVG(EMI_to_Salary_Ratio),2)     AS avg_emi_to_salary_ratio,
        ROUND(AVG(Saving_Capacity),2)         AS avg_saving_capacity
    FROM credit_data
    GROUP BY Occupation
    ORDER BY avg_emi DESC, avg_salary DESC;

"""

# Run the aggregation; the result has one row per distinct Occupation value.
occupation_stats = spark.sql(query)

In [0]:
# Persist the per-occupation aggregates as a Delta table in the Gold schema,
# replacing whatever the table held before.
(
    occupation_stats.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("credit_catalog.gold.occupation_stats")
)
     

### Financial Health by Credit Score

In [0]:

# Financial-health averages per Credit_Score value: mean EMI-to-salary ratio,
# mean saving capacity and mean monthly balance (unrounded).
# Rows with a NULL Credit_Score are filtered out. That removes every
# test-split row, because the test split's Credit_Score is filled with NULL
# when the two datasets are combined into "credit_data".
query = """
SELECT
    Credit_Score,
    AVG(EMI_to_Salary_Ratio) AS avg_emi_ratio,
    AVG(Saving_Capacity) AS avg_saving_capacity,
    AVG(Monthly_Balance) AS avg_monthly_balance
FROM credit_data
WHERE Credit_Score IS NOT NULL
GROUP BY Credit_Score;
"""

# Run the aggregation; the result has one row per distinct Credit_Score value.
financial_health = spark.sql(query)


In [0]:
# Persist the per-credit-score financial-health averages as a Delta table in
# the Gold schema, replacing whatever the table held before.
(
    financial_health.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("credit_catalog.gold.financial_health_stat")
)
     

### Delayed Payments Risk

In [0]:
# Delayed-payment statistics per Credit_Score value: the mean and the maximum
# of Num_of_Delayed_Payment (unrounded).
# Rows with a NULL Credit_Score are filtered out. That removes every
# test-split row, because the test split's Credit_Score is filled with NULL
# when the two datasets are combined into "credit_data".
query = """
SELECT
    Credit_Score,
    AVG(Num_of_Delayed_Payment) AS avg_delays,
    MAX(Num_of_Delayed_Payment) AS max_delays
FROM credit_data
WHERE Credit_Score IS NOT NULL
GROUP BY Credit_Score;
"""

# Run the aggregation; the result has one row per distinct Credit_Score value.
delay_payments_risk = spark.sql(query)


In [0]:
# Persist the per-credit-score delayed-payment statistics as a Delta table in
# the Gold schema, replacing whatever the table held before.
(
    delay_payments_risk.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("credit_catalog.gold.delay_payments_risk_stat")
)
     

### Loan Type Penetration (%)

In [0]:
# Loan-type penetration per Credit_Score value: the mean of each has_*_loan
# column multiplied by 100.
# NOTE(review): this is a percentage only if the has_* columns are 0/1
# indicators -- presumably they are; confirm against the Silver-layer schema.
# Rows with a NULL Credit_Score are filtered out. That removes every
# test-split row, because the test split's Credit_Score is filled with NULL
# when the two datasets are combined into "credit_data".
query = """
SELECT
    Credit_Score,
    AVG(has_personal_loan) * 100 AS pct_personal_loan,
    AVG(has_payday_loan) * 100 AS pct_payday_loan,
    AVG(has_home_equity_loan) * 100 AS pct_home_equity
FROM credit_data
WHERE Credit_Score IS NOT NULL
GROUP BY Credit_Score;
"""

# Run the aggregation; the result has one row per distinct Credit_Score value.
loan_risk = spark.sql(query)


In [0]:
# Persist the per-credit-score loan-type percentages as a Delta table in the
# Gold schema, replacing whatever the table held before.
(
    loan_risk.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("credit_catalog.gold.loan_risk_stat")
)
     

### Aggregation Dashboard (by Credit Score)

In [0]:

# Dashboard aggregates per Credit_Score value: row count plus the unrounded
# means of annual income, monthly in-hand salary, EMI-to-salary ratio and
# saving capacity.
# NOTE(review): customer_count is COUNT(*), i.e. a row count; it equals the
# number of customers only if each customer appears once -- confirm.
# Rows with a NULL Credit_Score are filtered out. That removes every
# test-split row, because the test split's Credit_Score is filled with NULL
# when the two datasets are combined into "credit_data".
query = """ 
SELECT
    Credit_Score,
    COUNT(*) AS customer_count,
    AVG(Annual_Income) AS avg_income,
    AVG(Monthly_Inhand_Salary) AS avg_salary,
    AVG(EMI_to_Salary_Ratio) AS avg_emi_ratio,
    AVG(Saving_Capacity) AS avg_saving_capacity
FROM credit_data
WHERE Credit_Score IS NOT NULL
GROUP BY Credit_Score;
"""
# Run the aggregation; the result has one row per distinct Credit_Score value.
credit_score_stats = spark.sql(query)

In [0]:
# Persist the per-credit-score dashboard aggregates as a Delta table in the
# Gold schema, replacing whatever the table held before.
(
    credit_score_stats.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("credit_catalog.gold.credit_score_stats")
)
     

### Aggregation Dashboard (by Income Group)

In [0]:

# Dashboard aggregates per Income_Group value: row count plus rounded means of
# monthly balance, EMI-to-salary ratio (3 decimals), saving capacity,
# outstanding debt and monthly EMI (2 decimals each). The loan count is
# rounded with no scale argument (presumably to a whole number -- confirm).
# Note that the avg_saving column is the mean of Monthly_Balance.
# Nothing is filtered here: rows from both the train and the test split are
# included, and rows with a NULL Income_Group form a group of their own.
# NOTE(review): total_customers is COUNT(*), i.e. a row count; it equals the
# number of customers only if each customer appears once -- confirm.
query = """ 
SELECT
    Income_Group,

    COUNT(*) AS total_customers,

    ROUND(AVG(Monthly_Balance), 2)               AS avg_saving,
    ROUND(AVG(EMI_to_Salary_Ratio), 3)           AS avg_emi_salary_ratio,
    ROUND(AVG(Num_of_Loan))                   AS avg_number_of_loans,
    ROUND(AVG(Saving_Capacity), 2)               AS avg_saving_capacity,
    ROUND(AVG(Outstanding_Debt), 2)              AS avg_outstanding_debt,
    ROUND(AVG(Total_EMI_per_month), 2)           AS avg_emi_per_month

FROM credit_data
GROUP BY Income_Group
ORDER BY Income_Group;
"""

# Run the aggregation; the result has one row per distinct Income_Group value.
income_group_stats = spark.sql(query)

In [0]:
# Persist the per-income-group dashboard aggregates as a Delta table in the
# Gold schema, replacing whatever the table held before.
(
    income_group_stats.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("credit_catalog.gold.income_group_stats")
)
     

### Max Debt, Max Salary and Average Saving Capacity by Age

In [0]:
# Per-Age extremes and savings: the maximum monthly in-hand salary and the
# maximum outstanding debt (each rounded with no scale argument, presumably to
# whole numbers -- confirm), plus the mean Saving_Capacity multiplied by 100
# and rounded to 2 decimals.
# NOTE(review): the "* 100" yields a percentage only if Saving_Capacity is
# stored as a fraction -- confirm against the Silver-layer schema.
# Nothing is filtered here, so rows from both the train and the test split
# are included.
query = """
SELECT
    Age,
    ROUND(MAX(Monthly_Inhand_Salary)) AS max_monthly_salary,
    ROUND(AVG(Saving_Capacity)*100, 2)  AS avg_saving_capacity_percent,
    ROUND(MAX(Outstanding_Debt)) AS max_outstanding_debt
FROM credit_data
GROUP BY Age
ORDER BY Age ASC;
"""
# Run the aggregation; the result has one row per distinct Age value.
max_debt_and_earning_by_age = spark.sql(query)

# Bare expression with no assignment: presumably left as the last statement of
# the notebook cell so the DataFrame is shown as the cell output.
max_debt_and_earning_by_age

In [0]:
# Persist the per-age salary, savings and debt figures as a Delta table in the
# Gold schema, replacing whatever the table held before.
(
    max_debt_and_earning_by_age.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("credit_catalog.gold.max_debt_and_earning_by_age")
)
     

### Core Credit Score × Income Group Aggregation

In [0]:
# Cross-tabulated aggregates for every (Income_Group, Credit_Score) pair: row
# count plus means of monthly balance, saving capacity, EMI-to-salary ratio,
# outstanding debt and monthly EMI, each rounded to 2 decimals. The
# delayed-payment mean is rounded with no scale argument (presumably to a
# whole number -- confirm).
# Note that the avg_saving column is the mean of Monthly_Balance.
# Rows with a NULL Income_Group or a NULL Credit_Score are filtered out. The
# Credit_Score condition removes every test-split row, because the test
# split's Credit_Score is filled with NULL when the two datasets are combined
# into "credit_data".
# NOTE(review): total_customers is COUNT(*), i.e. a row count; it equals the
# number of customers only if each customer appears once -- confirm.
query = """ 
SELECT
    Income_Group,
    Credit_Score,

    COUNT(*) AS total_customers,

    ROUND(AVG(Monthly_Balance), 2)        AS avg_saving,
    ROUND(AVG(Saving_Capacity), 2)        AS avg_saving_capacity,
    ROUND(AVG(EMI_to_Salary_Ratio), 2)    AS avg_emi_salary_ratio,
    ROUND(AVG(Outstanding_Debt), 2)       AS avg_outstanding_debt,
    ROUND(AVG(Total_EMI_per_month), 2)    AS avg_emi_per_month,
    ROUND(AVG(Num_of_Delayed_Payment)) AS avg_delayed_payments

FROM credit_data
WHERE
    Income_Group IS NOT NULL
    AND Credit_Score IS NOT NULL

GROUP BY
    Income_Group,
    Credit_Score

ORDER BY
    Income_Group,
    Credit_Score;

"""
# Run the aggregation; the result has one row per distinct
# (Income_Group, Credit_Score) combination.
income_group_credit_score_stats = spark.sql(query)
# Persist the income-group by credit-score aggregates as a Delta table in the
# Gold schema, replacing whatever the table held before.
(
    income_group_credit_score_stats.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("credit_catalog.gold.income_group_credit_score_stats")
)

### 