In [1]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, r2_score
from sklearn.compose import make_column_selector
from sklearn.compose import make_column_transformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold

import warnings
warnings.filterwarnings("ignore")

In [2]:
from sklearn.datasets import fetch_california_housing

X, y = fetch_california_housing(as_frame=True, return_X_y=True)
X, y 

(       MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
 0      8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
 1      8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
 2      7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
 3      5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
 4      3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   
 ...       ...       ...       ...        ...         ...       ...       ...   
 20635  1.5603      25.0  5.045455   1.133333       845.0  2.560606     39.48   
 20636  2.5568      18.0  6.114035   1.315789       356.0  3.122807     39.49   
 20637  1.7000      17.0  5.205543   1.120092      1007.0  2.325635     39.43   
 20638  1.8672      18.0  5.329513   1.171920       741.0  2.123209     39.43   
 20639  2.3886      16.0  5.254717   1.162264      1387.0  2.616981     39.37   
 
        Longitude  
 0    

In [3]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      20640 non-null  float64
 1   HouseAge    20640 non-null  float64
 2   AveRooms    20640 non-null  float64
 3   AveBedrms   20640 non-null  float64
 4   Population  20640 non-null  float64
 5   AveOccup    20640 non-null  float64
 6   Latitude    20640 non-null  float64
 7   Longitude   20640 non-null  float64
dtypes: float64(8)
memory usage: 1.3 MB


In [4]:
rfc = RandomForestRegressor(random_state=24, n_estimators=10)
kfold = KFold(n_splits=5,random_state=24, shuffle=True)

params = {
    'max_features' : [2,3,4,5,6,7],
    'max_depth':[None, 3, 5],
    'min_samples_split': [2,10,50],
    'min_samples_leaf':[1,10,50],
    'n_estimators':[20,50]
}

gcv = GridSearchCV(
    estimator=rfc,
    cv=kfold,
    verbose=2,
    scoring='r2',
    param_grid=params
)

gcv.fit(X,y)

gcv.best_params_, gcv.best_estimator_

Fitting 5 folds for each of 324 candidates, totalling 1620 fits
[CV] END max_depth=None, max_features=2, min_samples_leaf=1, min_samples_split=2, n_estimators=20; total time=   1.2s
[CV] END max_depth=None, max_features=2, min_samples_leaf=1, min_samples_split=2, n_estimators=20; total time=   1.1s
[CV] END max_depth=None, max_features=2, min_samples_leaf=1, min_samples_split=2, n_estimators=20; total time=   1.2s
[CV] END max_depth=None, max_features=2, min_samples_leaf=1, min_samples_split=2, n_estimators=20; total time=   1.1s
[CV] END max_depth=None, max_features=2, min_samples_leaf=1, min_samples_split=2, n_estimators=20; total time=   1.1s
[CV] END max_depth=None, max_features=2, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   2.9s
[CV] END max_depth=None, max_features=2, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   2.9s
[CV] END max_depth=None, max_features=2, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=

({'max_depth': None,
  'max_features': 3,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 50},
 RandomForestRegressor(max_features=3, n_estimators=50, random_state=24))

![alt text](image.png)

The **Out-of-Bag (OOB) score** is a way to check how well a Random Forest model performs by testing it on data that wasn’t used to build each tree.

### In short:
- Each tree in a Random Forest skips some data while training (called "out-of-bag" data).
- The model is then tested on this skipped data to get an **OOB score**.
- This gives a quick accuracy estimate without needing extra test data.
- Out-of-bag samples serve as a **validation set** to estimate model accuracy without separate cross-validation.


**Available only** in ensemble methods like `RandomForestClassifier` and `RandomForestRegressor`.

In [5]:
rfc = RandomForestRegressor(random_state=24, n_estimators=10,oob_score=True)
kfold = KFold(n_splits=5,random_state=24, shuffle=True)

params = {
    'max_features' : [2,3,4,5,6,7],
    'max_depth':[None, 3, 5],
    'min_samples_split': [2,10,50],
    'min_samples_leaf':[1,10,50],
    'n_estimators':[20,50]
}

rcv = RandomizedSearchCV(
    estimator=rfc,
    cv=kfold,
    verbose=2,
    scoring='r2',
    param_distributions=params,
    n_iter=15,
    random_state=24,
)

rcv.fit(X,y)

rcv.best_params_, rcv.best_estimator_,rcv.best_estimator_.oob_score_

Fitting 5 folds for each of 15 candidates, totalling 75 fits
[CV] END max_depth=None, max_features=3, min_samples_leaf=10, min_samples_split=2, n_estimators=50; total time=   2.5s
[CV] END max_depth=None, max_features=3, min_samples_leaf=10, min_samples_split=2, n_estimators=50; total time=   2.6s
[CV] END max_depth=None, max_features=3, min_samples_leaf=10, min_samples_split=2, n_estimators=50; total time=   2.6s
[CV] END max_depth=None, max_features=3, min_samples_leaf=10, min_samples_split=2, n_estimators=50; total time=   2.6s
[CV] END max_depth=None, max_features=3, min_samples_leaf=10, min_samples_split=2, n_estimators=50; total time=   2.6s
[CV] END max_depth=None, max_features=7, min_samples_leaf=50, min_samples_split=10, n_estimators=50; total time=   4.0s
[CV] END max_depth=None, max_features=7, min_samples_leaf=50, min_samples_split=10, n_estimators=50; total time=   4.0s
[CV] END max_depth=None, max_features=7, min_samples_leaf=50, min_samples_split=10, n_estimators=50; tot

({'n_estimators': 20,
  'min_samples_split': 2,
  'min_samples_leaf': 1,
  'max_features': 5,
  'max_depth': None},
 RandomForestRegressor(max_features=5, n_estimators=20, oob_score=True,
                       random_state=24),
 0.7851320095221819)

In [6]:
list(pd.DataFrame(rcv.cv_results_)['params'])

[{'n_estimators': 50,
  'min_samples_split': 2,
  'min_samples_leaf': 10,
  'max_features': 3,
  'max_depth': None},
 {'n_estimators': 50,
  'min_samples_split': 10,
  'min_samples_leaf': 50,
  'max_features': 7,
  'max_depth': None},
 {'n_estimators': 20,
  'min_samples_split': 50,
  'min_samples_leaf': 10,
  'max_features': 3,
  'max_depth': 5},
 {'n_estimators': 20,
  'min_samples_split': 10,
  'min_samples_leaf': 10,
  'max_features': 6,
  'max_depth': 5},
 {'n_estimators': 20,
  'min_samples_split': 50,
  'min_samples_leaf': 10,
  'max_features': 4,
  'max_depth': 3},
 {'n_estimators': 20,
  'min_samples_split': 2,
  'min_samples_leaf': 1,
  'max_features': 2,
  'max_depth': 3},
 {'n_estimators': 20,
  'min_samples_split': 2,
  'min_samples_leaf': 1,
  'max_features': 5,
  'max_depth': None},
 {'n_estimators': 20,
  'min_samples_split': 2,
  'min_samples_leaf': 10,
  'max_features': 5,
  'max_depth': 5},
 {'n_estimators': 50,
  'min_samples_split': 10,
  'min_samples_leaf': 1,
  '