In [24]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import (VectorAssembler,
                                OneHotEncoder,
                                StringIndexer)


# Seed handed to the train/test split so the partition is repeatable.
RANDOM_STATE = 87

# Reuse an already-running session if there is one, otherwise start a new
# one registered under this application name.
session_builder = SparkSession.builder.appName("DecisionTreeRegression")
spark = session_builder.getOrCreate()

# load data
def load_housing_data(file_path):
    """Load the housing data set from a CSV file.

    Args:
        file_path: Location of the CSV file; forwarded unchanged to
            ``pandas.read_csv``.

    Returns:
        The parsed file contents as a ``pandas.DataFrame``.
    """
    housing_frame = pd.read_csv(file_path)
    return housing_frame

# Absolute location of the housing CSV on the author's machine; adjacent
# string literals are concatenated into a single path.
FILE_PATH = (
    "/Users/pepijnschouten/Desktop/Python_Scripts/"
    "Python_Scripts_Books/Distributed_ML_with_PySpark/"
    "Python_Own_Files/Chapter 4/data/Housing.csv"
)

# Read with pandas first and preview the first rows.
pandas_df = load_housing_data(FILE_PATH)
housing_preview = pandas_df.head()
print(housing_preview)

# Hand the pandas frame over to Spark for the rest of the workflow.
spark_df = spark.createDataFrame(pandas_df)

# prepare data
# String-valued columns that have to be converted to numbers before they
# can be placed in a feature vector.
cat_columns = ["mainroad", "guestroom",
               "basement", "hotwaterheating",
               "airconditioning", "prefarea",
               "furnishingstatus"]
# Derived column names, built once and reused by every stage below.
label_columns = [column + "_label" for column in cat_columns]
encoded_columns = [column + "_encoded" for column in cat_columns]

# encode categorical data
# A single multi-column StringIndexer indexes every categorical column in
# one fit/transform, instead of fitting and transforming once per column.
# handleInvalid="keep" is applied to all of the columns.
indexer = StringIndexer(inputCols=cat_columns,
                        outputCols=label_columns,
                        handleInvalid="keep")
spark_df = indexer.fit(spark_df).transform(spark_df)

# one hot encoding
# Adds one "<column>_encoded" vector column per "<column>_label" column.
encoder = OneHotEncoder(inputCols=label_columns,
                        outputCols=encoded_columns)
spark_df = encoder.fit(spark_df).transform(spark_df)

# combining features in one vector
# NOTE: the feature vector is assembled from the numeric columns plus the
# "<column>_label" index columns; the "<column>_encoded" one-hot columns
# are added to spark_df above but are not part of "features".
numeric_columns = ["area", "bedrooms", "bathrooms",
                   "stories", "parking"]
feature_cols = numeric_columns + label_columns

assembler = VectorAssembler(inputCols=feature_cols,
                            outputCol="features")
spark_df = assembler.transform(spark_df)

# splitting data in train and test
# 80 % of the rows go to training, 20 % to testing; the seed fixes the split.
split_weights = [0.8, 0.2]
train_data, test_data = spark_df.randomSplit(split_weights,
                                             seed=RANDOM_STATE)

# model training
# Fit a decision tree that predicts "price" from the assembled "features".
dec_tree = DecisionTreeRegressor(featuresCol="features", labelCol="price")
spark_model = dec_tree.fit(train_data)

# predictions
# transform() appends the model output as a "prediction" column.
predictions = spark_model.transform(test_data)

# evaluation
# Both evaluators compare the same two columns and differ only in metric.
evaluator_columns = {"labelCol": "price", "predictionCol": "prediction"}
evaluator_rmse = RegressionEvaluator(metricName="rmse", **evaluator_columns)
evaluator_r2 = RegressionEvaluator(metricName="r2", **evaluator_columns)
rmse = evaluator_rmse.evaluate(predictions)
r2 = evaluator_r2.evaluate(predictions)

# print metrics and info
# string indexing and one hot encoding
spark_df.select("mainroad",
                "mainroad_label",
                "mainroad_encoded").distinct().show()
# show one feature vector
spark_df.select("features").show(1, truncate=False)

# feature importance
feature_importance = spark_model.featureImportances
feature_names = feature_cols
feature_importance_dict = dict(
    zip(feature_names, feature_importance))
sorted_feature_importance = sorted(
    feature_importance_dict.items(),
    key=lambda x: x[1],
    reverse=True)
for feature_names, importance in sorted_feature_importance:
    print(f"{feature_names}: {importance:.3f}")
    
# compare true and predicted values
predictions.select("price", "prediction").show(5)

# print test metrics
print(f"Test RMSE: {rmse:.2f}")
print(f"Test R2: {r2:.2f}")

# print train metrics
train_predictions = spark_model.transform(train_data)
evaluator_r2 = RegressionEvaluator(
    labelCol="price",
    predictionCol="prediction",
    metricName="r2")
train_r2 = evaluator_r2.evaluate(train_predictions)
print(f"Train R2: {train_r2:.2f}")





      price  area  bedrooms  bathrooms  stories mainroad guestroom basement  \
0  13300000  7420         4          2        3      yes        no       no   
1  12250000  8960         4          4        4      yes        no       no   
2  12250000  9960         3          2        2      yes        no      yes   
3  12215000  7500         4          2        2      yes        no      yes   
4  11410000  7420         4          1        2      yes       yes      yes   

  hotwaterheating airconditioning  parking prefarea furnishingstatus  
0              no             yes        2      yes        furnished  
1              no             yes        3       no        furnished  
2              no              no        2      yes   semi-furnished  
3              no             yes        3      yes        furnished  
4              no             yes        2       no        furnished  
+--------+--------------+----------------+
|mainroad|mainroad_label|mainroad_encoded|
+--------+---