In [None]:
from pyspark.sql.functions import col, trim, when
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql import SparkSession
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import Imputer, StringIndexer, OneHotEncoder, VectorAssembler

In [None]:
# Create (or reuse) the notebook's Spark session. The driver and executor memory
# settings are supplied to the builder before the session is obtained.
spark = (
    SparkSession.builder
    .appName("Regression")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.memory", "16g")
    .getOrCreate()
)

In [None]:
# Location of the joined loan CSV.
# NOTE(review): absolute path on one machine — point this at your own copy
# (ideally relative to a configurable data directory) before running.
DATA_PATH = "/home/chun/Downloads/cs-267/project/join_df.csv"

# Load the CSV and immediately discard the `loan_amount` and `grade` columns.
data = spark.read.csv(
    DATA_PATH,
    header=True,       # first row holds the column names
    inferSchema=True,  # ask Spark to infer column types (needs an extra pass over the file)
    multiLine=True,    # a quoted field may span several lines
    quote='"',         # fields are quoted with double quotes
    escape='"',        # a quote inside a quoted field is escaped with another quote
    sep=",",           # comma-delimited
).drop("loan_amount", "grade")

In [None]:
# Print the column names and inferred types of the loaded frame.
# NOTE(review): the saved output below still lists `loan_amount` and `grade`,
# which the load cell drops — the output looks stale; re-run to confirm.
data.printSchema()

root
 |-- emp_title: string (nullable = true)
 |-- emp_length: string (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_inc: double (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- addr_state: string (nullable = true)
 |-- avg_cur_bal: integer (nullable = true)
 |-- Tot_cur_bal: integer (nullable = true)
 |-- loan_status: string (nullable = true)
 |-- loan_amount: integer (nullable = true)
 |-- state: string (nullable = true)
 |-- funded_amount: integer (nullable = true)
 |-- term: string (nullable = true)
 |-- int_rate: double (nullable = true)
 |-- installment: double (nullable = true)
 |-- grade: string (nullable = true)
 |-- pymnt_plan: boolean (nullable = true)
 |-- type: string (nullable = true)
 |-- purpose: string (nullable = true)
 |-- description: string (nullable = true)



In [None]:
# Collect the string-typed columns of the loaded frame as candidate categorical
# features. `data.dtypes` yields (column_name, type_name) pairs.
# The frame loaded in this notebook is `data`; the previous reference to
# `join_df` pointed at a name that no cell here defines.
categorical_variables = [name for name, dtype in data.dtypes if dtype == "string"]
categorical_variables

['emp_title',
 'emp_length',
 'home_ownership',
 'verification_status',
 'addr_state',
 'loan_status',
 'state',
 'term',
 'grade',
 'type',
 'purpose',
 'description']

In [None]:
# Column groups shared by every stage in this cell.
numeric_cols = ["annual_inc", "avg_cur_bal", "Tot_cur_bal", "int_rate", "installment"]
categorical_cols = ["home_ownership", "verification_status", "state", "purpose"]
index_cols = [f"{name}_index" for name in categorical_cols]
vector_cols = [f"{name}_vec" for name in categorical_cols]

# 1) Impute missing numeric values. Output columns reuse the input names, so the
#    originals are overwritten; no strategy is set, so Imputer's default applies.
imputer = Imputer(inputCols=numeric_cols, outputCols=numeric_cols)
imputed = imputer.fit(data).transform(data)

# 2) Fit one StringIndexer per categorical column. Every indexer is fitted on the
#    imputed frame *before* any of them is applied; handleInvalid="skip" drops
#    rows the indexer cannot map.
indexers = []
for name, indexed_name in zip(categorical_cols, index_cols):
    stage = StringIndexer(inputCol=name, outputCol=indexed_name, handleInvalid="skip")
    indexers.append(stage.fit(imputed))

indexed = imputed
for fitted_indexer in indexers:
    indexed = fitted_indexer.transform(indexed)

# 3) One-hot encode the index columns into sparse vector columns.
encoder = OneHotEncoder(inputCols=index_cols, outputCols=vector_cols)
encoded = encoder.fit(indexed).transform(indexed)
print(len(encoded.columns))

# 4) Assemble numeric and encoded columns into a single `features` vector.
#    handleInvalid="skip" drops rows that still contain invalid values.
assembler = VectorAssembler(
    inputCols=numeric_cols + vector_cols,
    outputCol="features",
    handleInvalid="skip",
)
assembled = assembler.transform(encoded)

# 5) Keep only what the model needs: the feature vector and `funded_amount` as the label.
data = assembled.select("features", col("funded_amount").alias("label"))

<built-in function len>


In [None]:
import time

# Fixed seed shared by the split and the forest so the metrics below can be
# reproduced. NOTE(review): randomSplit is only repeatable if the input frame's
# partitioning/order is itself stable between runs.
SEED = 42

# Split data into training (70%) and testing (30%) sets
train_data, test_data = data.randomSplit([0.7, 0.3], seed=SEED)

# Define Random Forest Regressor (10 trees, each at most 10 levels deep)
rf = RandomForestRegressor(
    featuresCol="features",
    labelCol="label",
    numTrees=10,
    maxDepth=10,
    seed=SEED,
)

# Train the model
rf_model = rf.fit(train_data)

# Make predictions on the held-out split
predictions = rf_model.transform(test_data)

# Evaluate Root Mean Squared Error (RMSE)
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE): {rmse}")

# Evaluate Mean Absolute Error (MAE)
mae_evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="mae")
mae = mae_evaluator.evaluate(predictions)
print(f"Mean Absolute Error (MAE): {mae}")

# Save the model; the timestamp suffix gives each run its own output directory
rf_model.save(f"random_forest_model_{time.time()}")

24/12/05 15:22:30 WARN DAGScheduler: Broadcasting large task binary with size 1271.8 KiB


Root Mean Squared Error (RMSE): 3019.234473675644
Mean Absolute Error (MAE): 2249.966856975659
