In [2]:
# Exercise 3: GridSearchCV

In [3]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
import numpy as np

In [6]:
# Load data
housing = fetch_california_housing()
X, y = housing['data'], housing['target']

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, shuffle=True, random_state=43
)

# Define model
rf = RandomForestRegressor(random_state=43)

# Parameter grid (minimum 3 values each)
param_grid = {
    'max_depth': [5, 10, 20],          # You can expand later
    'n_estimators': [10, 50, 100]
}

# GridSearchCV with 5-fold CV, MSE scoring
grid = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',  # sklearn minimizes negative MSE
    n_jobs=-1,  # Use all CPUs
    verbose=1
)

# Fit
grid.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


0,1,2
,estimator,RandomForestR...ndom_state=43)
,param_grid,"{'max_depth': [5, 10, ...], 'n_estimators': [10, 50, ...]}"
,scoring,'neg_mean_squared_error'
,n_jobs,-1
,refit,True
,cv,5
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,20
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [12]:
# Extract the best model and its performance
best_model = grid.best_estimator_

print("Best Parameters:", grid.best_params_)
print("Best Score (validation MSE):", -grid.best_score_)  # convert back from negative
print("\nBest Estimator:\n", best_model)

Best Parameters: {'max_depth': 20, 'n_estimators': 100}
Best Score (validation MSE): 0.25679220412884723

Best Estimator:
 RandomForestRegressor(max_depth=20, random_state=43)


In [13]:
# Evaluate on test set
test_score = -grid.score(X_test, y_test)
print("\nTest Set MSE:", test_score)


Test Set MSE: 0.24221341009242012


In [14]:
import pandas as pd

# View all cross-validation results
print("\nCV Results:\n", pd.DataFrame(grid.cv_results_))


CV Results:
    mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0       0.859086      0.010191         0.005186        0.000060   
1       4.188849      0.036348         0.019976        0.000556   
2       8.490310      0.044673         0.039287        0.002021   
3       1.634984      0.021843         0.008163        0.000270   
4       8.607656      0.304968         0.044978        0.017275   
5      18.370954      0.953265         0.078009        0.006492   
6       3.041216      0.260773         0.016485        0.000377   
7      16.304078      0.312065         0.104102        0.017522   
8      27.557650      1.117248         0.120032        0.015368   

   param_max_depth  param_n_estimators  \
0                5                  10   
1                5                  50   
2                5                 100   
3               10                  10   
4               10                  50   
5               10                 100   
6               20   

In [15]:
## Key Learnings
#     GridSearchCV automates model selection and hyperparameter tuning.  
#     It uses cross-validation internally — each combination is validated multiple times for robustness.
#     MSE (Mean Squared Error) measures how close predictions are to actual values — lower is better.
#     The best model is the one that minimizes the validation MSE (i.e., maximizes neg_mean_squared_error).
#     After selecting the best model, always evaluate it on a held-out test set.