In [0]:
# Read CSV from FileStore
file_path = "/FileStore/shared_uploads/azuser3611_mml.local@techademy.com/accepted_2007_to_2018Q4.csv"

df_bronze = spark.read \
    .option("header", True) \
    .option("inferSchema", True) \
    .csv(file_path)

# Save to Bronze Delta
df_bronze.write \
    .format("delta") \
    .mode("overwrite") \
    .save("/FileStore/lending/bronze/delta")

# Preview data
df_bronze.show(5)


+--------+---------+---------+-----------+---------------+----------+--------+-----------+-----+---------+--------------------+----------+--------------+----------+-------------------+--------+-----------+----------+--------------------+----+------------------+------------------+--------+----------+-----+-----------+----------------+--------------+---------------+--------------+----------------------+----------------------+--------+-------+---------+----------+---------+-------------------+---------+-------------+------------------+---------------+---------------+-------------+------------------+----------+-----------------------+------------+---------------+------------+------------------+--------------------+-------------------+--------------------------+---------------------------+-----------+----------------+----------------+---------+-------------------------+--------------+------------+-----------+-----------+-----------+-----------+-----------+------------------+------------+---

In [0]:
# Preview data
df_bronze.show(5)

+--------+---------+---------+-----------+---------------+----------+--------+-----------+-----+---------+--------------------+----------+--------------+----------+-------------------+--------+-----------+----------+--------------------+----+------------------+------------------+--------+----------+-----+-----------+----------------+--------------+---------------+--------------+----------------------+----------------------+--------+-------+---------+----------+---------+-------------------+---------+-------------+------------------+---------------+---------------+-------------+------------------+----------+-----------------------+------------+---------------+------------+------------------+--------------------+-------------------+--------------------------+---------------------------+-----------+----------------+----------------+---------+-------------------------+--------------+------------+-----------+-----------+-----------+-----------+-----------+------------------+------------+---

In [0]:
# SILVER LAYER FULL CODE
from pyspark.sql.functions import col, regexp_replace

#Read from Bronze Delta Table
df_bronze = spark.read.format("delta").load("/FileStore/lending/bronze/delta")

#Clean & Transform
df_silver = df_bronze.dropna(subset=["loan_amnt", "term", "int_rate"]) \
    .withColumn("int_rate", regexp_replace("int_rate", "%", "").cast("double")) \
    .withColumn("loan_amnt", col("loan_amnt").cast("double")) \
    .withColumn("annual_inc", col("annual_inc").cast("double")) \
    .withColumn("dti", col("dti").cast("double")) \
    .withColumn("emp_length", regexp_replace("emp_length", "\\+ years| years|< 1 year", "")) \
    .withColumn("emp_length", regexp_replace("emp_length", "n/a", "0")) \
    .withColumn("emp_length", col("emp_length").cast("int"))

#Save as Delta Table in Silver Layer
df_silver.write.format("delta").mode("overwrite").save("/FileStore/lending/silver/delta")

#Preview Transformed Data
df_silver.show(5)


+-------+---------+---------+-----------+---------------+----------+--------+-----------+-----+---------+--------------------+----------+--------------+----------+-------------------+--------+-----------+----------+--------------------+--------------------+------------------+------------------+--------+----------+-----+-----------+----------------+--------------+---------------+--------------+----------------------+----------------------+--------+-------+---------+----------+---------+-------------------+---------+-------------+------------------+---------------+---------------+-------------+------------------+----------+-----------------------+------------+---------------+------------+------------------+--------------------+-------------------+--------------------------+---------------------------+-----------+----------------+----------------+---------+-------------------------+--------------+------------+-----------+-----------+-----------+-----------+-----------+------------------+-

In [0]:
from pyspark.sql.functions import avg, count, col

# Read Silver Delta Table
df_silver = spark.read.format("delta").load("/FileStore/lending/silver/delta")

# Aggregation Example 1 – Loan amount & interest rate by grade
loan_summary_by_grade = df_silver.groupBy("grade").agg(
    avg("loan_amnt").alias("avg_loan_amount"),
    avg("int_rate").alias("avg_interest_rate")
)

# Save to Gold Layer
loan_summary_by_grade.write.format("delta").mode("overwrite").save("/FileStore/lending/gold/loan_summary_by_grade")

# Aggregation Example 2 – Default count by loan status
loan_status_counts = df_silver.groupBy("loan_status").count()

# Save to Gold Layer
loan_status_counts.write.format("delta").mode("overwrite").save("/FileStore/lending/gold/loan_status_counts")

# Aggregation Example 3 – Average DTI by home ownership
dti_by_home = df_silver.groupBy("home_ownership").agg(
    avg("dti").alias("avg_dti")
)

# Save to Gold Layer
dti_by_home.write.format("delta").mode("overwrite").save("/FileStore/lending/gold/dti_by_home_ownership")

# Step 5: Show the first results
print("Loan Summary by Grade:")
loan_summary_by_grade.show()

print("Loan Status Counts:")
loan_status_counts.show()

print("Average DTI by Home Ownership:")
dti_by_home.show()


Loan Summary by Grade:
+-----+------------------+------------------+
|grade|   avg_loan_amount| avg_interest_rate|
+-----+------------------+------------------+
|    F| 19124.64653110048| 25.45409066985592|
|    E|17453.078391907933| 21.82965253356323|
|    B| 14173.33819852703| 10.67580623818657|
|    D| 15711.98300680591|18.143067189846647|
|    C|15038.083317821778|14.143688622323502|
|    A|14603.343209545825| 7.084545374764227|
|    G|20383.988740959896|28.074255424062848|
+-----+------------------+------------------+

Loan Status Counts:
+--------------------+-------+
|         loan_status|  count|
+--------------------+-------+
|          Fully Paid|1076751|
|             Default|     40|
|     In Grace Period|   8436|
|Does not meet the...|   1988|
|         Charged Off| 268558|
|  Late (31-120 days)|  21467|
|             Current| 878317|
|Does not meet the...|    761|
|   Late (16-30 days)|   4349|
|            Oct-2015|      1|
+--------------------+-------+

Average DTI by 