In [None]:
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the California housing dataset as pandas objects (as_frame=True).
data = fetch_california_housing(as_frame=True)

# The combined frame carries the feature columns plus the MedHouseVal target.
df = data["frame"]
df.head(5)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [None]:
# Print the column dtypes, non-null counts and memory usage of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   MedInc       20640 non-null  float64
 1   HouseAge     20640 non-null  float64
 2   AveRooms     20640 non-null  float64
 3   AveBedrms    20640 non-null  float64
 4   Population   20640 non-null  float64
 5   AveOccup     20640 non-null  float64
 6   Latitude     20640 non-null  float64
 7   Longitude    20640 non-null  float64
 8   MedHouseVal  20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB


In [None]:
# Split the frame into the regression target and the remaining predictor columns.
target_col = "MedHouseVal"
y = df[target_col]
X = df.drop(columns=[target_col])

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical across re-runs.
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

In [None]:
# Standardise every feature column; all predictors in X are numeric floats.
numeric_scaling = ("num", StandardScaler(), X.columns)
preprocessor = ColumnTransformer(transformers=[numeric_scaling])

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
from sklearn.pipeline import Pipeline

# Baseline model: scaled features feeding a plain linear regression.
linear_steps = [
    ("preprocessing", preprocessor),
    ("regressor", LinearRegression()),
]
model = Pipeline(steps=linear_steps)

In [None]:
# Fit the preprocessing step and the linear regressor on the training split.
model.fit(X=X_train, y=y_train)

In [None]:
# Score the linear baseline on the held-out test split.
y_pred = model.predict(X_test)

mae, r2 = (
    mean_absolute_error(y_test, y_pred),
    r2_score(y_test, y_pred),
)

for label, value in (("MAE :", mae), ("R² :", r2)):
    print(label, value)

MAE : 0.5332001304956565
R² : 0.575787706032451


In [None]:
# Score the same model on the data it was fitted on, for comparison with the
# test-set metrics.
y_train_pred = model.predict(X_train)

r2_train = r2_score(y_train, y_train_pred)
mae_train = mean_absolute_error(y_train, y_train_pred)

print(
    "Train MAE :", mae_train,
)
print(
    "Train R² :", r2_train,
)

Train MAE : 0.5286283596581934
Train R² : 0.6125511913966952


In [None]:
# Test-set residuals (actual minus predicted) for the linear model, followed by
# their summary statistics.
residuals = y_test.sub(y_pred)

residual_summary = residuals.describe()
print(residual_summary)

count    4128.000000
mean        0.003479
std         0.745664
min        -9.875331
25%        -0.460935
50%        -0.122439
75%         0.312442
max         4.148388
Name: MedHouseVal, dtype: float64


In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Random-forest pipeline with the same preprocessing recipe as the linear model.
#
# The preprocessor is cloned rather than reused: Pipeline.fit fits its steps in
# place (no copy is made), so handing the very same ColumnTransformer object to
# both pipelines would make them share a single fitted transformer, and
# refitting either pipeline would silently change the other one's scaling.
# clone() returns an unfitted estimator with identical parameters.
rf_model = Pipeline(steps=[
    ("preprocessing", clone(preprocessor)),
    ("regressor", RandomForestRegressor(
        n_estimators=200,   # number of trees in the forest
        random_state=42,    # fixed seed so the fitted forest is reproducible
        n_jobs=-1           # use all available CPU cores
    ))
])

In [None]:
# Fit the preprocessing step and the random forest on the training split.
rf_model.fit(X=X_train, y=y_train)

In [None]:
# Score the random forest on the held-out test split.
y_pred_rf = rf_model.predict(X_test)

mae_rf, r2_rf = (
    mean_absolute_error(y_test, y_pred_rf),
    r2_score(y_test, y_pred_rf),
)

for label, value in (("RandomForest MAE :", mae_rf), ("RandomForest R² :", r2_rf)):
    print(label, value)

RandomForest MAE : 0.3267529450096902
RandomForest R² : 0.8063074586513359


In [None]:
# Pull the fitted forest out of the pipeline and rank the features by its
# importance scores, largest first. The labels come from X.columns, which is
# the column order handed to the preprocessor.
forest = rf_model.named_steps["regressor"]
importances = forest.feature_importances_

feature_importance = (
    pd.Series(importances, index=X.columns)
    .sort_values(ascending=False)
)
feature_importance

Unnamed: 0,0
MedInc,0.525886
AveOccup,0.138055
Latitude,0.088647
Longitude,0.088307
HouseAge,0.054355
AveRooms,0.044449
Population,0.030693
AveBedrms,0.029608
