In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, entry) for entry in files):
        print(full_path)

imports


In [2]:
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor

from sklearn.metrics import mean_absolute_error, confusion_matrix,\
ConfusionMatrixDisplay, classification_report

from sklearn.model_selection import train_test_split,\
cross_validate, cross_val_score, ShuffleSplit, \
RandomizedSearchCV

from sklearn.tree import DecisionTreeRegressor

In [3]:
# Seed NumPy's global random generator so repeated runs start from the same state.
np.random.seed(seed=306)


shuffle-split

In [4]:
# Cross-validation strategy shared by the experiments below:
# 10 shuffled splits, each holding out 20% of the rows, with a fixed seed.
cv = ShuffleSplit(
    n_splits=10,
    test_size=0.2,
    random_state=42,
)


Download the dataset and split it into train/test sets

In [5]:
# Load the California housing data as pandas objects (feature frame + target series).
features, labels = fetch_california_housing(return_X_y=True, as_frame=True)
# Rescale the target by 100; presumably this expresses it in k$, which is the
# unit the error reports further down print.
labels = labels * 100

# First split: hold out a test set; the remainder is the complete training set.
(com_train_features, test_features,
 com_train_labels, test_labels) = train_test_split(
    features, labels, random_state=42)

# Second split: carve a dev set out of the complete training set.
(train_features, dev_features,
 train_labels, dev_labels) = train_test_split(
    com_train_features, com_train_labels, random_state=42)

Training different regressors

In [7]:
def train_regressor(estimator, X_train, y_train, cv, name):
    """Cross-validate ``estimator`` and print its mean absolute error.

    The estimator is evaluated with ``cross_validate`` using the
    ``neg_mean_absolute_error`` scorer. The mean and standard deviation of
    the error across the splits are printed for both the training part and
    the held-out part of each split.

    Parameters
    ----------
    estimator : estimator object
        Model forwarded to ``cross_validate``.
    X_train, y_train
        Features and targets forwarded to ``cross_validate``.
    cv
        Cross-validation strategy forwarded as ``cv``.
    name : str
        Label for the model, used only in the printed report.

    Returns
    -------
    None
        The results are only printed.
    """
    # The fitted per-split estimators are never used here, so they are not
    # requested; this avoids keeping one fitted model per split in memory.
    cv_results = cross_validate(
        estimator,
        X_train,
        y_train,
        cv=cv,
        scoring="neg_mean_absolute_error",
        return_train_score=True,
    )

    # The scorer reports the negated error; flip the sign to get the error back.
    cv_train_error = -1 * cv_results['train_score']
    cv_test_error = -1 * cv_results['test_score']

    print(f"On an average, {name} makes an error of "
          f"{cv_train_error.mean():.3f}k +/- {cv_train_error.std():.3f}k on the training set.")
    print(f"On an average, {name} makes an error of "
          f"{cv_test_error.mean():.3f}k +/- {cv_test_error.std():.3f}k on the test set.")

In [8]:
# Baseline: a default random forest, cross-validated on the complete training set.
train_regressor(
    estimator=RandomForestRegressor(),
    X_train=com_train_features,
    y_train=com_train_labels,
    cv=cv,
    name='random forest regressor',
)

On an average, random forest regressor makes an error of 12.643k +/- 0.061k on the training set.
On an average, random forest regressor makes an error of 33.129k +/- 0.701k on the test set.


In [None]:
# Hyperparameter tuning using GridSearchCV with a RandomForestRegressor

In [10]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
import pandas as pd

# Candidate values tried exhaustively for each hyperparameter.
param_grid = dict(
    n_estimators=[1, 2, 5, 10, 20, 50, 100, 200, 500],
    max_leaf_nodes=[2, 5, 10, 20, 50, 100],
)

# Exhaustive search over the grid, scored with the negated mean absolute error.
grid_cv = GridSearchCV(
    estimator=RandomForestRegressor(n_jobs=2),
    param_grid=param_grid,
    scoring="neg_mean_absolute_error",
    n_jobs=2,
)

# Run the search on the complete training set.
grid_cv.fit(com_train_features, com_train_labels)

# Build a table with one row per candidate: the parameter columns plus the
# error (sign-flipped score) and its spread across the folds.
columns = [f"param_{name}" for name in param_grid] + ["mean_test_error", "std_test_error"]
cv_results = pd.DataFrame(grid_cv.cv_results_)
cv_results = cv_results.assign(
    mean_test_error=-cv_results["mean_test_score"],
    std_test_error=cv_results["std_test_score"],
)
result_table = cv_results[columns].sort_values(by="mean_test_error")

# Report the winning combination, its error, and the ranked table.
print("Best Parameters:")
print(grid_cv.best_params_)
print("\nBest Mean Test Error:", -grid_cv.best_score_)
print("\nResults Table:")
print(result_table)


Best Parameters:
{'max_leaf_nodes': 100, 'n_estimators': 200}

Best Mean Test Error: 40.5687594290511

Results Table:
   param_n_estimators param_max_leaf_nodes  mean_test_error  std_test_error
52                200                  100        40.568759        0.708010
53                500                  100        40.611287        0.782177
51                100                  100        40.631864        0.703829
50                 50                  100        40.910324        0.688116
49                 20                  100        41.001398        0.737265
48                 10                  100        41.472686        0.512427
47                  5                  100        41.980259        1.023331
41                 50                   50        43.798228        0.904086
43                200                   50        43.803850        0.769831
44                500                   50        43.839888        0.773010
42                100                   50    

Hyperparameter Tuning using RandomizedSearchCV with a RandomForestRegressor (1)

In [9]:
# Values from which the randomized search samples each hyperparameter.
param_distributions = dict(
    n_estimators=[1, 2, 5, 10, 20, 50, 100, 200, 500],
    max_leaf_nodes=[2, 5, 10, 20, 50, 100],
)

# Randomized search: 10 sampled candidates, seeded, scored with negated MAE.
search_cv = RandomizedSearchCV(
    estimator=RandomForestRegressor(n_jobs=2),
    param_distributions=param_distributions,
    scoring="neg_mean_absolute_error",
    n_iter=10,
    random_state=0,
    n_jobs=2,
)
search_cv.fit(com_train_features, com_train_labels)

# One row per sampled candidate: parameter columns plus the error
# (sign-flipped score) and its spread across the folds.
columns = [f"param_{name}" for name in param_distributions] + ["mean_test_error", "std_test_error"]
cv_results = pd.DataFrame(search_cv.cv_results_)
cv_results = cv_results.assign(
    mean_test_error=-cv_results["mean_test_score"],
    std_test_error=cv_results["std_test_score"],
)
# Last expression of the cell, so the ranked table is displayed as its output.
cv_results[columns].sort_values(by="mean_test_error")

Unnamed: 0,param_n_estimators,param_max_leaf_nodes,mean_test_error,std_test_error
0,500,100,40.621985,0.769115
2,10,100,41.302509,0.863997
7,100,50,43.714186,0.799775
8,1,100,46.436338,1.02812
6,50,20,49.389446,1.150302
1,100,20,49.467752,0.997048
9,10,20,50.073348,1.218088
3,500,10,54.983731,1.053561
4,5,5,61.278826,0.942016
5,5,2,73.487031,0.985396


In [11]:
# The search was configured with a negated-error scorer, so flip the sign of
# the score obtained on the held-out test set to report a positive error.
neg_test_score = search_cv.score(test_features, test_labels)
error = -neg_test_score
print(f"On average, our random forest regressor makes an error of {error:.2f} k$")

On average, our random forest regressor makes an error of 40.53 k$


 2. Create a single pipeline that performs the full data preparation plus the final prediction.
 Data preparation: feature selection, scaling, and a RandomForestRegressor.
 Final prediction: fit the pipeline and predict (showing the results).

3. Transformer: SelectFromModel is used to select the most important features.

In [12]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Define the hyperparameter distribution for RandomizedSearchCV
param_dist = {
    "regressor__n_estimators": [1, 2, 5, 10, 20, 50, 100, 200, 500],
    "regressor__max_leaf_nodes": [2, 5, 10, 20, 50, 100],
}

# Instantiate the RandomForestRegressor
rf_reg = RandomForestRegressor(n_jobs=2)

# Create a pipeline with feature selection and regression
pipeline = Pipeline([
    ('feature_selection', SelectFromModel(rf_reg)),
    ('scaler', StandardScaler()),  # You can add other preprocessing steps here
    ('regressor', rf_reg)
])

# Instantiate the RandomizedSearchCV object
randomized_cv = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    scoring="neg_mean_absolute_error",
    n_iter=10,  # Number of parameter settings sampled
    n_jobs=2,
)

# Fit the model with the training data
randomized_cv.fit(com_train_features, com_train_labels)

# Extract and display the results
columns = [f"param_{name}" for name in param_dist.keys()]
columns += ["mean_test_error", "std_test_error"]
cv_results = pd.DataFrame(randomized_cv.cv_results_)
cv_results["mean_test_error"] = -cv_results["mean_test_score"]
cv_results["std_test_error"] = cv_results["std_test_score"]
result_table = cv_results[columns].sort_values(by="mean_test_error")

# Display the best parameters and corresponding mean test error
print("Best Parameters:")
print(randomized_cv.best_params_)
print("\nBest Mean Test Error:", -randomized_cv.best_score_)
print("\nResults Table:")
print(result_table)


Best Parameters:
{'regressor__n_estimators': 500, 'regressor__max_leaf_nodes': 50}

Best Mean Test Error: 54.082240941424416

Results Table:
  param_regressor__n_estimators param_regressor__max_leaf_nodes  \
5                           500                              50   
6                            20                              50   
8                           500                              20   
0                             2                              50   
7                             2                              10   
2                             1                              10   
3                            50                               5   
1                           100                               5   
4                            20                               5   
9                             2                               2   

   mean_test_error  std_test_error  
5        54.082241        1.132283  
6        54.205617        1.092575  
8        5