In [4]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.feature import (StringIndexer,
                                OneHotEncoder,
                                VectorAssembler)
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from collections import Counter

RANDOM_STATE = 87

file_dir = "/Users/pepijnschouten/Desktop/Python_Scripts/" \
    "Python_Scripts_Books/Distributed_ML_with_PySpark/" \
        "Python_Own_Files/Chapter 5/data/Housing.csv"
pandas_df = pd.read_csv(file_dir)

# create dataframe
spark = (SparkSession
         .builder
         .appName("Random forest regression")
         .getOrCreate())

pyspark_df = spark.createDataFrame(pandas_df)

cat_cols = ["mainroad", "guestroom", "basement",
            "hotwaterheating", "airconditioning",
            "prefarea", "furnishingstatus"]

# index string values
indexers = [StringIndexer(inputCol=col,
                          outputCol=col+"_label",
                          handleInvalid="keep")
            for col in cat_cols]

for indexer in indexers:
    pyspark_df = indexer.fit(pyspark_df).transform(pyspark_df)

# encode categorical values
encoder = OneHotEncoder(inputCols=[column+"_label"
                                   for column in cat_cols],
                        outputCols=[column+"_encoded"
                                    for column in cat_cols])

pyspark_df = encoder.fit(pyspark_df).transform(pyspark_df)

feature_cols = ["area", "bedrooms", "bathrooms",
                "stories", "parking"] + [col+"_label"
                                         for col
                                         in cat_cols]

# vector assembler
assembler = VectorAssembler(inputCols=feature_cols,
                            outputCol="features")

pyspark_df = assembler.transform(pyspark_df)

# train test split
train_data, test_data = pyspark_df.randomSplit([0.8, 0.2],
                                                seed=RANDOM_STATE)

# random forrest regressor
rf = RandomForestRegressor(featuresCol="features",
                           labelCol="price")

# train the model
spark_model = rf.fit(train_data)

# make predictions
predictions = spark_model.transform(test_data)

# evaluating the model
evaluator_rmse = RegressionEvaluator(labelCol="price",
                                     predictionCol="prediction",
                                     metricName="rmse")
evaluator_r2 = RegressionEvaluator(labelCol="price",
                                   predictionCol="prediction",
                                   metricName="r2")

rmse = evaluator_rmse.evaluate(predictions)
r2 = evaluator_r2.evaluate(predictions)

# print info and metrics
tree_depth = []
for tree in spark_model.trees:
    tree_depth.append(tree.depth)
depth_counter = Counter(tree_depth)
print("Tree Depths and Counts:")
for depth, count in depth_counter.items():
    print(f"Depth: {depth}: {count} trees")

# feature importances
feature_importances = spark_model.featureImportances
feature_names = feature_cols
feature_importances_tuples = list(
    zip(feature_names, feature_importances))
sorted_feature_importance = sorted(
    feature_importances_tuples,
    key=lambda x: x[1],
    reverse=True)
print("Feature importances:")
for feature, importance in sorted_feature_importance:
    print(f"Feature '{feature}': {importance:.3f}")
    
# compare some trua and predicted values
predictions.select("price", "prediction").show(5)

# print metrics
print(f"RMSE: {rmse} | R2: {r2}")

Tree Depths and Counts:
Depth: 5: 20 trees
Feature importances:
Feature 'area': 0.370
Feature 'bathrooms': 0.174
Feature 'airconditioning_label': 0.095
Feature 'parking': 0.075
Feature 'bedrooms': 0.072
Feature 'prefarea_label': 0.068
Feature 'stories': 0.064
Feature 'furnishingstatus_label': 0.038
Feature 'guestroom_label': 0.017
Feature 'basement_label': 0.014
Feature 'mainroad_label': 0.011
Feature 'hotwaterheating_label': 0.003
+-------+-----------------+
|  price|       prediction|
+-------+-----------------+
|7000000|5988444.238830279|
|7210000|7060078.129245688|
|7350000|6210159.215437459|
|7560000|6343784.293045112|
|8120000|6451554.632793765|
+-------+-----------------+
only showing top 5 rows

RMSE: 1160452.4699666426 | R2: 0.6211051478228472
