In [None]:
%python
# Install the kagglehub client that the dataset-download cell below imports.
# NOTE(review): no version is pinned, so a fresh environment may resolve a different release — consider pinning.
# NOTE(review): `!pip` runs pip through a shell; confirm it targets the same environment the notebook kernel uses.
!pip install kagglehub

You should consider upgrading via the '/local_disk0/.ephemeral_nfs/envs/pythonEnv-d17efb68-69d4-4653-8d22-d76944ded0ee/bin/python -m pip install --upgrade pip' command.[0m


In [None]:
%python
import kagglehub
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import Imputer, StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.regression import RandomForestRegressor, DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Fetch the latest published version of the dataset through kagglehub.
# The call returns the local directory that holds the downloaded files,
# which is echoed so the location is visible in the cell output.
DATASET_HANDLE = "mustafakeser4/bigquery-fintech-dataset"
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/mustafakeser4/bigquery-fintech-dataset/versions/1


In [None]:
%python
def _read_dataset_csv(dataset_dir, file_name):
    """Load one CSV file of the downloaded dataset into a Spark DataFrame.

    Args:
        dataset_dir: Local directory that contains the dataset files
            (the value returned by ``kagglehub.dataset_download``).
        file_name: Name of the CSV file inside ``dataset_dir``.

    Returns:
        The DataFrame produced by ``spark.read.csv`` for that file.
    """
    return spark.read.csv(
        # Same `file://` URI form as before, but built from the download
        # location instead of a path pinned to one cached dataset version.
        f"file://{dataset_dir}/{file_name}",
        header=True,       # First row holds the column names
        inferSchema=True,  # Let Spark infer column types from the data
        multiLine=True,    # Support multi-line fields
        quote='"',         # Handle quoted fields properly
        escape='"',        # Escape quotes inside quoted fields
        sep=",",           # Specify comma as the delimiter
    )


# Both tables are read with identical CSV options; only the file name differs.
customer_df = _read_dataset_csv(path, "customer.csv")
loan_df = _read_dataset_csv(path, "loan.csv")

In [None]:
# Schema preview: customers joined to their loans on customer_id, with the
# listed columns removed. Only the resulting schema is printed here.
dropped_columns = ["customer_id", "loan_id", "emp_title", "emp_length", "grade", "description", "notes"]
same_customer = customer_df.customer_id == loan_df.customer_id
data2 = customer_df.join(loan_df, same_customer).drop(*dropped_columns)
data2.printSchema()

root
 |-- home_ownership: string (nullable = true)
 |-- annual_inc: double (nullable = true)
 |-- annual_inc_joint: double (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- zip_code: string (nullable = true)
 |-- addr_state: string (nullable = true)
 |-- avg_cur_bal: double (nullable = true)
 |-- Tot_cur_bal: double (nullable = true)
 |-- loan_status: string (nullable = true)
 |-- loan_amount: double (nullable = true)
 |-- state: string (nullable = true)
 |-- funded_amount: double (nullable = true)
 |-- term: string (nullable = true)
 |-- int_rate: double (nullable = true)
 |-- installment: double (nullable = true)
 |-- issue_d: string (nullable = true)
 |-- issue_date: string (nullable = true)
 |-- issue_year: double (nullable = true)
 |-- pymnt_plan: boolean (nullable = true)
 |-- type: string (nullable = true)
 |-- purpose: string (nullable = true)



In [None]:
# Column groups shared by every preprocessing stage in this cell.
DROPPED_COLS = ["customer_id", "loan_id", "emp_title", "emp_length", "grade", "description", "notes"]
NUMERIC_COLS = ["annual_inc", "annual_inc_joint", "avg_cur_bal", "Tot_cur_bal", "loan_amount", "int_rate", "installment"]
CATEGORICAL_COLS = ["home_ownership", "verification_status", "state", "purpose"]
index_cols = [f"{name}_index" for name in CATEGORICAL_COLS]
vector_cols = [f"{name}_vec" for name in CATEGORICAL_COLS]

# Join customers to their loans and remove the columns that are not modelled.
data = customer_df.join(loan_df, customer_df.customer_id == loan_df.customer_id).drop(*DROPPED_COLS)

# Fill missing numeric values; the outputs overwrite the input columns.
imputer = Imputer(inputCols=NUMERIC_COLS, outputCols=NUMERIC_COLS)
data = imputer.fit(data).transform(data)

# Index each categorical column. Every indexer is fitted on the imputed frame
# before any of them is applied; handleInvalid="skip" filters out rows the
# indexer cannot encode.
indexers = [
    StringIndexer(inputCol=name, outputCol=f"{name}_index", handleInvalid="skip").fit(data)
    for name in CATEGORICAL_COLS
]
for indexer in indexers:
    data = indexer.transform(data)

# One-hot encode the indexed categorical columns.
encoder = OneHotEncoder(inputCols=index_cols, outputCols=vector_cols)
data = encoder.fit(data).transform(data)

# Assemble the numeric columns followed by the one-hot vectors into a single
# "features" vector; rows the assembler cannot handle are skipped.
assembler = VectorAssembler(
    inputCols=NUMERIC_COLS + vector_cols,
    outputCol="features",
    handleInvalid="skip",
)
data = assembler.transform(data)



# Save the model


Root Mean Squared Error (RMSE): 312.02956062108615
Mean Absolute Error (MAE): 134.2521759515924


In [None]:
import time

# Fixed seed for the split and the tree so reported metrics can be reproduced.
# NOTE(review): this assumes the upstream DataFrame is computed deterministically
# between runs — confirm, or cache `data` before splitting.
SEED = 42

# Select final dataset for training and testing.
# The guard keeps the cell re-runnable: after the first run `data` only has
# "features" and "label", so selecting "funded_amount" again would fail.
if "label" not in data.columns:
    data = data.select("features", col("funded_amount").alias("label"))

# Split data into training (70%) and testing (30%) sets.
train_data, test_data = data.randomSplit([0.7, 0.3], seed=SEED)

# Define the regressor. Despite the historical `rf` / `rf_model` names, this is
# a single decision tree, not a random forest; the names are kept because
# later cells reference `rf_model`.
rf = DecisionTreeRegressor(featuresCol="features", labelCol="label", maxDepth=20, seed=SEED)

# Train the model
rf_model = rf.fit(train_data)

# Make predictions on the held-out split
predictions = rf_model.transform(test_data)

# Evaluate Root Mean Squared Error (RMSE)
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE): {rmse}")

# Evaluate Mean Absolute Error (MAE)
mae_evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="mae")
mae = mae_evaluator.evaluate(predictions)
print(f"Mean Absolute Error (MAE): {mae}")

# Save the model; the timestamp suffix gives every run its own output path.
rf_model.save(f"decision_tree_model_{time.time()}")

In [None]:
# Display the fitted tree's textual description (split rules and leaf predictions) as the cell output.
rf_model.toDebugString

Out[15]: 'DecisionTreeRegressionModel: uid=DecisionTreeRegressor_2115307f5b0c, depth=10, numNodes=1681, numFeatures=76\n  If (feature 4 <= 18487.5)\n   If (feature 4 <= 9987.5)\n    If (feature 4 <= 5987.5)\n     If (feature 4 <= 3737.5)\n      If (feature 4 <= 2850.0)\n       If (feature 6 <= 92.91)\n        If (feature 5 <= 0.20525)\n         If (feature 67 in {1.0})\n          If (feature 5 <= 0.11005000000000001)\n           If (feature 0 <= 71040.0)\n            Predict: 1819.7519083969466\n           Else (feature 0 > 71040.0)\n            Predict: 2039.2045454545455\n          Else (feature 5 > 0.11005000000000001)\n           If (feature 0 <= 24131.0)\n            Predict: 1616.2735849056603\n           Else (feature 0 > 24131.0)\n            Predict: 1733.816837315131\n         Else (feature 67 not in {1.0})\n          If (feature 5 <= 0.11005000000000001)\n           If (feature 69 in {1.0})\n            Predict: 1817.6315789473683\n           Else (feature 69 not in {1.0})\n