In [1]:
# Author: Roi Yehoshua <roiyeho@gmail.com>
# License: MIT

import numpy as np
import matplotlib.pyplot as plt

# Seed NumPy's legacy global random generator so stochastic steps that fall
# back on it are repeatable from a fresh kernel.
SEED = 0
np.random.seed(SEED)

In [2]:
from sklearn.datasets import fetch_california_housing

# Load the California housing dataset and pull out the pieces used below:
# the feature matrix, the regression target, and the column names.
data = fetch_california_housing()
X = data.data
y = data.target
feature_names = data.feature_names

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [4]:
from sklearn.ensemble import RandomForestRegressor

# Pin the forest's own seed instead of relying on the global NumPy seed set at
# the top of the notebook, so refitting this cell yields the same model no
# matter what has consumed the global random stream in between.
reg = RandomForestRegressor(random_state=0)
reg.fit(X_train, y_train)

RandomForestRegressor()

In [5]:
# Evaluate the fitted model on both splits (score() returns R^2 for regressors),
# then report the two numbers, training set first.
train_score = reg.score(X_train, y_train)
test_score = reg.score(X_test, y_test)

print(f'R2 score (train): {train_score:.4f}')
print(f'R2 score (test): {test_score:.4f}')

R2 score (train): 0.9727
R2 score (test): 0.7980


Feature importance

In [6]:
# Rank the features from most to least important.
# Read the importances once and reuse the array for both ranking and lookup.
importances = reg.feature_importances_
idx = np.argsort(importances)[::-1]

# Index the dataset's original column list (data.feature_names) rather than the
# `feature_names` variable this cell reassigns: re-running the cell would
# otherwise re-permute the already-sorted names and mislabel the importances.
feature_names = np.array(data.feature_names)[idx]
feature_importances = importances[idx]

# Print the features alongside their importances
for name, score in zip(feature_names, feature_importances):
    print(f'{name}: {score:.4f}')

MedInc: 0.5295
AveOccup: 0.1353
Latitude: 0.0848
Longitude: 0.0837
HouseAge: 0.0558
AveRooms: 0.0473
Population: 0.0330
AveBedrms: 0.0306
