In [8]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder

# seed used to make the randomised steps repeatable
RANDOM_STATE = 87

# absolute location of the housing CSV; the adjacent string literals are
# joined by the parser into a single path
file_dir = (
    "/Users/pepijnschouten/Desktop/Python_Scripts/"
    "Python_Scripts_Books/Distributed_ML_with_PySpark/"
    "Python_Own_Files/Chapter 6/data/Housing.csv"
)

# read the whole CSV into a DataFrame
pandas_df = pd.read_csv(file_dir)

# separate the target column ("price") from the feature columns
X = pandas_df.drop(columns="price")
y = pandas_df["price"]

# one-hot encode the categorical columns into dense indicator columns
cat_cols = ["mainroad", "guestroom", "basement",
            "hotwaterheating", "airconditioning",
            "prefarea", "furnishingstatus"]
onehot_encoder = OneHotEncoder(sparse_output=False)
X_encoder = onehot_encoder.fit_transform(X[cat_cols])
# Reuse X's index for the encoded frame: pd.concat(axis=1) aligns rows on
# index labels, so a frame carrying a fresh RangeIndex would be misaligned
# (producing NaN-padded rows) whenever pandas_df has a non-default index.
x_encoded_df = pd.DataFrame(
    X_encoder,
    columns=onehot_encoder.get_feature_names_out(cat_cols),
    index=X.index,
)

# swap the raw categorical columns for their encoded counterparts
X = X.drop(columns=cat_cols)
X = pd.concat([X, x_encoded_df], axis=1)

# split in train and test (20% of the rows are held out for testing)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE)

# train the GBT model; the estimator is seeded with the same constant as the
# split so that repeated runs fit the same model and report the same metrics
gbt_regressor = GradientBoostingRegressor(random_state=RANDOM_STATE)
gbt_regressor.fit(X_train, y_train)

# predictions on the held-out rows
y_pred = gbt_regressor.predict(X_test)

# evaluation of the predictions against the held-out targets
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# report the hyperparameters of the fitted model
print(f"# estimators: {gbt_regressor.n_estimators}")
print(f"Max. depth: {gbt_regressor.max_depth}")

# feature importances, listed from the largest score to the smallest
importances = gbt_regressor.feature_importances_
feature_names = X.columns
indices = np.argsort(importances)[::-1]
print("Feature importances:")
for name, score in zip(feature_names[indices], importances[indices]):
    print(f"{name}: {score:.3f}")

# show the first few true prices next to the model's predictions
results_df = pd.DataFrame({"Price": y_test, "Prediction": y_pred})
print(results_df.head())

# report the evaluation metrics
print(f"RMSE: {rmse:.2f}")
print(f"R2: {r2:.2f}")



# estimators: 100
Max. depth: 3
Feature importances:
area: 0.467
bathrooms: 0.166
airconditioning_no: 0.075
stories: 0.057
bedrooms: 0.041
parking: 0.040
furnishingstatus_unfurnished: 0.034
airconditioning_yes: 0.032
prefarea_no: 0.017
basement_no: 0.010
basement_yes: 0.009
hotwaterheating_no: 0.009
furnishingstatus_furnished: 0.008
prefarea_yes: 0.008
guestroom_yes: 0.006
mainroad_no: 0.006
guestroom_no: 0.005
mainroad_yes: 0.004
hotwaterheating_yes: 0.003
furnishingstatus_semi-furnished: 0.003
        Price    Prediction
220   4795000  7.738002e+06
469   3010000  3.696516e+06
534   2100000  2.933966e+06
10    9800000  6.762227e+06
2    12250000  6.618916e+06
RMSE: 1228098.74
R2: 0.56
