In [0]:
#Problem Statement
#A bank wants to predict whether a customer will repay a loan. They have the following customer details:

#Age: Age of the customer.
#Monthly_Income: Monthly income of the customer.
#Loan_Amount: Total loan amount requested.
#Credit_Score: A score indicating creditworthiness (0–1000).
#Employment_Years: Number of years the customer has been employed.
#The target variable is: Loan_Default (Yes/No): Whether the customer defaults on the loan

In [0]:
# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer   #Most machine learning algorithms (including Decision Trees and Random Forests) cannot directly handle categorical strings like "Yes" or "No". They work on numeric data, so categorical values must be transformed into numbers.
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Step 1: Load the data into PySpark DataFrame

# Assuming the data is loaded into a DataFrame (e.g., from a CSV or database)
data_df = spark.sql("SELECT * FROM loan_default_data")
data_df.show()
# Ensure numeric data types for the features
from pyspark.sql.functions import col
data_df = data_df.withColumn("Age", col("Age").cast("double")) \
                 .withColumn("Monthly_Income", col("Monthly_Income").cast("double")) \
                 .withColumn("Loan_Amount", col("Loan_Amount").cast("double")) \
                 .withColumn("Credit_Score", col("Credit_Score").cast("double")) \
                 .withColumn("Employment_Years", col("Employment_Years").cast("double"))

# Convert the target variable to numeric using StringIndexer
indexer = StringIndexer(inputCol="Loan_Default", outputCol="label")
data_df = indexer.fit(data_df).transform(data_df)

data_df.printSchema()

+---+--------------+-----------+------------+----------------+------------+
|Age|Monthly_Income|Loan_Amount|Credit_Score|Employment_Years|Loan_Default|
+---+--------------+-----------+------------+----------------+------------+
| 58|         12485|       5037|         555|              28|          No|
| 25|         13274|      44252|         377|              34|         Yes|
| 19|         11125|      26243|         753|              19|         Yes|
| 65|          5598|      37021|         863|              39|          No|
| 35|         13216|       6276|         400|              33|         Yes|
| 33|          7313|      12331|         351|               0|          No|
| 32|         14588|      28788|         967|              35|         Yes|
| 26|         14713|      25153|         853|              19|         Yes|
| 65|          2916|      20692|         315|               6|          No|
| 24|          5752|       8796|         395|               8|          No|
| 61|       

In [0]:

# Step 2: Prepare the data
feature_columns = ["Age", "Monthly_Income", "Loan_Amount", "Credit_Score", "Employment_Years"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
data_prepared = assembler.transform(data_df).select("features", "label")
data_df.show()

# Step 3: Split the data into training and test sets
train_data, test_data = data_prepared.randomSplit([0.8, 0.2], seed=42)

+----+--------------+-----------+------------+----------------+------------+-----+
| Age|Monthly_Income|Loan_Amount|Credit_Score|Employment_Years|Loan_Default|label|
+----+--------------+-----------+------------+----------------+------------+-----+
|58.0|       12485.0|     5037.0|       555.0|            28.0|          No|  0.0|
|25.0|       13274.0|    44252.0|       377.0|            34.0|         Yes|  1.0|
|19.0|       11125.0|    26243.0|       753.0|            19.0|         Yes|  1.0|
|65.0|        5598.0|    37021.0|       863.0|            39.0|          No|  0.0|
|35.0|       13216.0|     6276.0|       400.0|            33.0|         Yes|  1.0|
|33.0|        7313.0|    12331.0|       351.0|             0.0|          No|  0.0|
|32.0|       14588.0|    28788.0|       967.0|            35.0|         Yes|  1.0|
|26.0|       14713.0|    25153.0|       853.0|            19.0|         Yes|  1.0|
|65.0|        2916.0|    20692.0|       315.0|             6.0|          No|  0.0|
|24.

In [0]:

# Step 4a: Train the Decision Tree Classifier
dt = DecisionTreeClassifier(featuresCol="features", labelCol="label")
dt_model = dt.fit(train_data)

# Output Decision Tree model details
print("Decision Tree Model:")
print(f"  Depth: {dt_model.depth}")
print(f"  Number of Nodes: {dt_model.numNodes}")

# print(f"  Depth: {dt_model.depth}") Purpose: Prints the depth of the trained Decision Tree model.
#What is Depth?
#The depth of a decision tree is the number of levels in the tree.
#The root node is at depth 0, and the maximum depth of the tree is the length of the longest path from the root to a leaf node.
#A deeper tree can model more complex patterns but is also more prone to overfitting.

#print(f"  Number of Nodes: {dt_model.numNodes}") Purpose: Prints the total number of nodes in the trained Decision Tree model.
#What are Nodes?
#Nodes in a decision tree represent decision points based on feature values.
#Includes:
#Decision nodes: Where the model makes a split.
#Leaf nodes: Terminal nodes where predictions are made.
#Implication:
#More nodes generally indicate a more complex tree.
#A large number of nodes can increase model complexity and risk of overfitting.


Decision Tree Model:
  Depth: 5
  Number of Nodes: 15


In [0]:

# Step 4b: Train the Random Forest Classifier
rf = RandomForestClassifier(featuresCol="features", labelCol="label", numTrees=100)
rf_model = rf.fit(train_data)

# Output Random Forest model details
print("Random Forest Model:")
print(f"  Number of Trees: {rf_model.getNumTrees}")

Random Forest Model:
  Number of Trees: 100


In [0]:

# Step 5: Make predictions
# Decision Tree predictions
dt_predictions = dt_model.transform(test_data)
dt_predictions.select("features", "label", "prediction").show()

# Random Forest predictions
rf_predictions = rf_model.transform(test_data)
rf_predictions.select("features", "label", "prediction").show()

+--------------------+-----+----------+
|            features|label|prediction|
+--------------------+-----+----------+
|[19.0,12532.0,223...|  0.0|       0.0|
|[20.0,14302.0,359...|  0.0|       1.0|
|[22.0,12468.0,121...|  1.0|       1.0|
|[23.0,9668.0,3869...|  1.0|       1.0|
|[25.0,13274.0,442...|  1.0|       0.0|
|[28.0,5258.0,1746...|  0.0|       1.0|
|[31.0,10085.0,271...|  1.0|       1.0|
|[32.0,14592.0,359...|  0.0|       1.0|
|[36.0,6119.0,2325...|  1.0|       0.0|
|[36.0,12501.0,316...|  1.0|       1.0|
|[39.0,3489.0,6378...|  0.0|       1.0|
|[40.0,4621.0,5471...|  0.0|       1.0|
|[40.0,12280.0,194...|  1.0|       1.0|
|[42.0,3796.0,2008...|  0.0|       0.0|
|[46.0,6304.0,4901...|  1.0|       1.0|
|[52.0,8572.0,1016...|  1.0|       1.0|
|[56.0,10317.0,212...|  0.0|       1.0|
+--------------------+-----+----------+

+--------------------+-----+----------+
|            features|label|prediction|
+--------------------+-----+----------+
|[19.0,12532.0,223...|  0.0|       0.0|

In [0]:

# Step 6: Evaluate the models
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy"
)

# Evaluate Decision Tree
dt_accuracy = evaluator.evaluate(dt_predictions)
print("Decision Tree Model Performance:")
print(f"  Accuracy: {dt_accuracy}")

# Evaluate Random Forest
rf_accuracy = evaluator.evaluate(rf_predictions)
print("Random Forest Model Performance:")
print(f"  Accuracy: {rf_accuracy}")


# Conclusion
print("Summary of Results:")
print("Decision Tree:")
print(f"  Accuracy: {dt_accuracy}")
print("Random Forest:")
print(f"  Accuracy: {rf_accuracy}")

Decision Tree Model Performance:
  Accuracy: 0.5294117647058824
Random Forest Model Performance:
  Accuracy: 0.5882352941176471
Summary of Results:
Decision Tree:
  Accuracy: 0.5294117647058824
Random Forest:
  Accuracy: 0.5882352941176471


In [0]:
#Decision Tree Model Performance: Achieved an accuracy of 52.94%, indicating correct predictions for roughly half the test dataset.
#Random Forest Model Performance: Achieved an accuracy of 58.82%, showing improved prediction reliability due to ensemble learning.
#Summary of Results: Random Forest outperformed Decision Tree by approximately 5.88% in accuracy, highlighting the benefits of combining multiple trees.