In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, root_mean_squared_error
from sklearn.preprocessing import OneHotEncoder

RANDOM_STATE = 87

file_dir = "/Users/pepijnschouten/Desktop/Python_Scripts/" \
    "Python_Scripts_Books/Distributed_ML_with_PySpark/" \
        "Python_Own_Files/Chapter 5/data/Housing.csv"
pandas_df = pd.read_csv(file_dir)

# load data
X = pandas_df.drop("price", axis=1)
y = pandas_df["price"]

cat_cols = ["mainroad", "guestroom", "basement",
            "hotwaterheating", "airconditioning",
            "prefarea", "furnishingstatus"]
onehot_encoder = OneHotEncoder(sparse_output=False)
X_encoder = onehot_encoder.fit_transform(X[cat_cols])
x_encoded_df = pd.DataFrame(X_encoder,
                            columns=onehot_encoder.get_feature_names_out(cat_cols))
X = X.drop(cat_cols, axis=1)
X = pd.concat([X, x_encoded_df], axis=1)

# split in train and test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE)

# train the model
random_forrest_model = RandomForestRegressor()
random_forrest_model.fit(X_train, y_train)

# predictions
y_pred = random_forrest_model.predict(X_test)

# evaluation
r2 = r2_score(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)

# print information
print(f"Number of trees: {random_forrest_model.n_estimators}")

# feature importances
print("Feature importances:")
for feature, importance in zip(X.columns,
                               sorted(
                                   random_forrest_model.feature_importances_,
                                   reverse=True)):
    print(f"{feature}: {importance:.3f}")

# depth of the trees
tree_depths = [estimator.tree_.max_depth for estimator in
               random_forrest_model.estimators_]
print(f"Depth of each tree: {tree_depths}")

# compare some trua and predicted values
results_df = pd.DataFrame(
    {"Price": y_test, "Prediction": y_pred})
print(results_df.head())

# metrics
print(f"R2 score: {r2}")
print(f"RMSE: {rmse}")




Number of trees: 100
Feature importances:
area: 0.460
bedrooms: 0.137
bathrooms: 0.060
stories: 0.056
parking: 0.045
mainroad_no: 0.043
mainroad_yes: 0.038
guestroom_no: 0.037
guestroom_yes: 0.016
basement_no: 0.015
basement_yes: 0.013
hotwaterheating_no: 0.013
hotwaterheating_yes: 0.013
airconditioning_no: 0.011
airconditioning_yes: 0.011
prefarea_no: 0.008
prefarea_yes: 0.007
furnishingstatus_furnished: 0.007
furnishingstatus_semi-furnished: 0.006
furnishingstatus_unfurnished: 0.005
Depth of each tree: [17, 19, 15, 17, 16, 18, 19, 16, 17, 18, 15, 14, 15, 15, 16, 17, 18, 15, 16, 17, 20, 15, 17, 17, 19, 18, 17, 19, 22, 17, 18, 16, 15, 15, 17, 15, 17, 16, 16, 19, 14, 19, 16, 13, 18, 15, 14, 18, 15, 17, 18, 14, 18, 16, 17, 19, 17, 16, 16, 17, 19, 17, 17, 16, 21, 18, 17, 14, 15, 14, 16, 19, 16, 14, 17, 16, 19, 18, 16, 16, 15, 18, 14, 15, 17, 18, 14, 16, 18, 14, 16, 14, 17, 15, 16, 18, 15, 17, 17, 17]
        Price  Prediction
220   4795000   8924825.0
469   3010000   3883530.0
534   21000