In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Step 1: Load dataset – features go into a labelled DataFrame, target is kept
# exactly as the loader returns it
housing = fetch_california_housing()
y = housing.target
X = pd.DataFrame(data=housing.data, columns=housing.feature_names)

# Step 2: Preprocessing – scaling
# The scaler is fitted on the full feature frame and then applied to that same frame.
# NOTE(review): X_scaled is not referenced again in the visible cells; the model
# further down is trained on columns taken from X, not X_scaled – confirm whether
# this step is still needed.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Step 3: Train-test split
# The split is done BEFORE feature selection so that the held-out rows never
# influence which features are chosen (ranking features against the target on
# all rows would leak test-set information into the model and inflate the
# reported metrics). Same test_size / random_state as before, so the row
# partition is unchanged.
X_train_all, X_test_all, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 4: Feature selection – top 5 features by absolute correlation with the
# target, measured on the training rows only.
# corrwith aligns on index labels, and the training frame keeps its original
# (shuffled) row labels, so the target Series must carry the same index.
corr = X_train_all.corrwith(pd.Series(y_train, index=X_train_all.index))
top_features = corr.abs().sort_values(ascending=False).head(5).index
print("Top 5 correlated features:\n", top_features.tolist())

# Restrict every frame to the selected columns.
X_selected = X[top_features]        # all rows, selected columns (kept for compatibility)
X_train = X_train_all[top_features]
X_test = X_test_all[top_features]

# Step 5: Train regression model
# Random forest with 100 trees; the fixed random_state keeps runs repeatable.
# fit() returns the estimator itself, so construction and training are chained.
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Step 6: Predictions for the held-out test rows
y_pred = rf.predict(X_test)

# Step 7: Evaluation
# Compare the test-set predictions with the true targets.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # root of the MSE, i.e. back in the target's own units
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report every metric rounded to three decimals, in a fixed order.
print("Regression Model Evaluation Metrics:")
for label, value in (
    ("MAE  :", mae),
    ("MSE  :", mse),
    ("RMSE :", rmse),
    ("R²   :", r2),
):
    print(label, round(value, 3))




Top 5 correlated features:
 ['MedInc', 'AveRooms', 'Latitude', 'HouseAge', 'AveBedrms']
Regression Model Evaluation Metrics:
MAE  : 0.469
MSE  : 0.453
RMSE : 0.673
R²   : 0.654


In [None]:
# Step 8: Optional – Plot predicted vs actual values
fig, ax = plt.subplots()

# One point per test row: x = true target, y = model prediction.
ax.scatter(y_test, y_pred, alpha=0.5)

# Diagonal reference line where predicted == actual, spanning the range of the
# true targets. Line colour is set with `color`; `cmap` is not a line property
# and makes plot() raise.
lo, hi = y_test.min(), y_test.max()
ax.plot([lo, hi], [lo, hi], color='red')

ax.set_xlabel("Actual Values")
ax.set_ylabel("Predicted Values")
ax.set_title("Actual vs Predicted Values (Random Forest)")
plt.show()