In [1]:
import numpy as np
import matplotlib.pyplot as plt

np.random.seed(0)

In [2]:
from sklearn.datasets import fetch_california_housing

data = fetch_california_housing()
X, y = data.data, data.target
feature_names = data.feature_names

In [3]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [4]:
from sklearn.ensemble import RandomForestRegressor

reg = RandomForestRegressor()
reg.fit(X_train, y_train)

RandomForestRegressor()

In [5]:
train_score = reg.score(X_train, y_train)
print('R2 score on the training set:', np.round(train_score, 4))

test_score = reg.score(X_test, y_test)
print('R2 score on the test set:', np.round(test_score, 4))

R2 score on the training set: 0.9734
R2 score on the test set: 0.7978


Feature importance

In [7]:
# Sort the features by their importances in decreasing order
idx = np.argsort(reg.feature_importances_)[::-1]
feature_names = np.array(feature_names)[idx]
feature_importances = reg.feature_importances_[idx]

# Print the features alongside their importances
for name, score in zip(feature_names, feature_importances):
    print(f'{name}: {score:.4f}')

MedInc: 0.5295
AveOccup: 0.1349
Longitude: 0.0855
Latitude: 0.0853
HouseAge: 0.0552
AveRooms: 0.0467
Population: 0.0328
AveBedrms: 0.0301
