In [7]:
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import (
    VectorAssembler, StringIndexer, OneHotEncoder)

# Fixed seed so the random train/test split further down is repeatable.
RANDOM_STATE = 87

# Absolute path of the housing CSV; adjacent string literals are joined
# by the parser into a single path string.
file_dir = (
    "/Users/pepijnschouten/Desktop/Python_Scripts/"
    "Python_Scripts_Books/Distributed_ML_with_PySpark/"
    "Python_Own_Files/Chapter 6/data/Housing.csv"
)

# The CSV is parsed on the driver with pandas first ...
pandas_df = pd.read_csv(file_dir)

# ... then a Spark session named "GBTRegression" is obtained (an already
# running session is reused by getOrCreate) ...
spark_builder = SparkSession.builder.appName("GBTRegression")
spark = spark_builder.getOrCreate()

# ... and the pandas frame is converted into a Spark DataFrame.
pyspark_df = spark.createDataFrame(pandas_df)


# Columns treated as categorical; each one gets an indexed ("_label") and
# a one-hot ("_encoded") companion column.
cat_cols = ["mainroad", "guestroom", "basement",
            "hotwaterheating", "airconditioning",
            "prefarea", "furnishingstatus"]
label_cols = [name + "_label" for name in cat_cols]
encoded_cols = [name + "_encoded" for name in cat_cols]

# index categorical values
# A single multi-column StringIndexer fits and transforms all categorical
# columns in one go instead of one fit/transform round trip per column.
# handleInvalid="keep" is retained from the per-column version.
indexer = StringIndexer(inputCols=cat_cols,
                        outputCols=label_cols,
                        handleInvalid="keep")
pyspark_df = indexer.fit(pyspark_df).transform(pyspark_df)

# encode categorical values
# NOTE(review): the "_encoded" columns are added to the DataFrame but are
# not listed in feature_cols below, so the model is trained on the
# "_label" indices. The stage is kept so the DataFrame schema is unchanged.
encoder = OneHotEncoder(inputCols=label_cols,
                        outputCols=encoded_cols)

pyspark_df = encoder.fit(pyspark_df).transform(pyspark_df)

# Numeric columns first, followed by the indexed categorical columns; this
# order is also the order of the entries in the assembled feature vector.
feature_cols = ["area", "bedrooms", "bathrooms",
                "stories", "parking"] + label_cols

# vector assembler
assembler = VectorAssembler(inputCols=feature_cols,
                            outputCol="features")

pyspark_df = assembler.transform(pyspark_df)

# train test split (80 % / 20 %), seeded so the partition is repeatable
train_data, test_data = pyspark_df.randomSplit([0.8, 0.2],
                                                seed=RANDOM_STATE)

# Gradient-boosted tree regressor predicting "price" from "features".
# The seed is pinned to RANDOM_STATE as well, so the estimator does not
# fall back to its default seed while only the split is reproducible.
gbt_model = GBTRegressor(featuresCol="features",
                         labelCol="price",
                         seed=RANDOM_STATE)
spark_model = gbt_model.fit(train_data)

# Score the held-out rows; adds a "prediction" column.
predictions = spark_model.transform(test_data)

# Root-mean-squared error on the test set.
evaluator_rmse = RegressionEvaluator(labelCol="price",
                                     predictionCol="prediction",
                                     metricName="rmse")
rmse = evaluator_rmse.evaluate(predictions)

# Coefficient of determination (R^2) on the test set.
evaluator_r2 = RegressionEvaluator(labelCol="price",
                                   predictionCol="prediction",
                                   metricName="r2")
r2 = evaluator_r2.evaluate(predictions)

# Summary of the fitted ensemble: tree count and the maxDepth parameter.
num_trees = spark_model.getNumTrees
max_depth = spark_model.getOrDefault("maxDepth")
print(f"Num. trees: {num_trees}",
      f"Max depth: {max_depth}",
      sep="\n")

# Pair every input column name with its importance score; the names are
# taken from feature_cols, the list the assembler was built from.
feature_names = feature_cols
feature_importances = spark_model.featureImportances
feature_importances_tuples = [
    (name, score)
    for name, score in zip(feature_names, feature_importances)
]

# Rank from most to least important (stable in-place sort on a copy).
sorted_feature_importance = list(feature_importances_tuples)
sorted_feature_importance.sort(key=lambda pair: pair[1], reverse=True)
for name, score in sorted_feature_importance:
    print(f"Feature {name}: {score:.3f}")

# First five test rows: actual price next to the model's prediction.
predictions.select("price", "prediction").show(n=5)

# Hold-out metrics computed earlier.
print(f"Test RMSE: {rmse:.2f}")
print(f"Test R2: {r2:.2f}")

Num. trees: 20
Max depth: 5
Feature area: 0.367
Feature bathrooms: 0.109
Feature parking: 0.083
Feature airconditioning_label: 0.076
Feature stories: 0.065
Feature bedrooms: 0.062
Feature guestroom_label: 0.051
Feature mainroad_label: 0.044
Feature furnishingstatus_label: 0.044
Feature prefarea_label: 0.043
Feature basement_label: 0.033
Feature hotwaterheating_label: 0.024
+-------+--------------------+
|  price|          prediction|
+-------+--------------------+
|7000000|   7257448.734636264|
|7210000|    7458939.22333838|
|7350000|   5298164.855151334|
|7560000|   6526843.342434246|
|8120000|1.1402739360943178E7|
+-------+--------------------+
only showing top 5 rows

Test RMSE: 1546198.65
Test R2: 0.33


24/10/26 09:46:31 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
