<a href="https://colab.research.google.com/github/pranavsrinivas29/hyperparameter_opt/blob/main/Gradient_Boosting_Regressor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Gradient Boosting Regressor Without Hyperparameter Optimization

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

In [None]:
# Fetch the scikit-learn diabetes dataset and wrap the feature matrix and the
# target vector in labelled pandas containers.
diabetes = load_diabetes()
X = pd.DataFrame(data=diabetes.data, columns=diabetes.feature_names)
y = pd.Series(data=diabetes.target, name="target")

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Standardize the features. The scaler is fitted on the training rows only and
# the same fitted transform is then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Baseline model: gradient boosting with 100 estimators and a fixed seed.
# No other hyperparameters are set explicitly here.
regressor = GradientBoostingRegressor(
    random_state=42,
    n_estimators=100,
)

# Train on the scaled training split, then predict the held-out test split.
regressor.fit(X_train_scaled, y_train)
y_pred = regressor.predict(X_test_scaled)


In [None]:
# Evaluate the baseline predictions against the held-out test targets.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # root of the MSE, i.e. the RMSE
r2 = r2_score(y_test, y_pred)

print("Root Mean Squared Error (RMSE):", rmse)
print("R-squared (R2):", r2)

Root Mean Squared Error (RMSE): 55.52776934600318
R-squared (R2): 0.42883338547082817


Randomized Search-Based Optimization

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Candidate values for each tuned hyperparameter. This dictionary is reused by
# the other search strategies later in the notebook.
param_dist_random = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.001, 0.01, 0.1, 0.2, 0.3],
    max_depth=[3, 4, 5, 6],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Randomized search over the candidate values above:
#   n_iter=100      -> number of random parameter settings to try
#   cv=5            -> number of cross-validation folds
#   random_state=42 -> fixed seed for the search
random_search = RandomizedSearchCV(
    estimator=regressor,
    param_distributions=param_dist_random,
    n_iter=100,
    cv=5,
    n_jobs=-1,
    random_state=42,
)


In [None]:

# Run the randomized search on the scaled training split.
random_search.fit(X_train_scaled, y_train)

# Model carrying the best hyperparameters found by the search.
best_regressor_random = random_search.best_estimator_

# Score that model on the held-out test split.
y_pred_random = best_regressor_random.predict(X_test_scaled)
mse_random = mean_squared_error(y_test, y_pred_random)
rmse_random = np.sqrt(mse_random)
r2_random = r2_score(y_test, y_pred_random)

print("Root Mean Squared Error (RMSE) for Best Model (Random Search):", rmse_random)
print("R-squared (R2) for Best Model (Random Search):", r2_random)

# Report the winning hyperparameter combination.
print("Best Hyperparameters (Random Search):", random_search.best_params_)

Root Mean Squared Error (RMSE) for Best Model (Random Search): 58.90503409298303
R-squared (R2) for Best Model (Random Search): 0.35724245549807165
Best Hyperparameters (Random Search): {'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_depth': 6, 'learning_rate': 0.01}


Grid Search-Based Optimization

In [None]:
from sklearn.model_selection import GridSearchCV

# Grid search over the same candidate values used for the randomized search
# (param_dist_random), with 5 cross-validation folds.
grid_search = GridSearchCV(
    estimator=regressor,
    param_grid=param_dist_random,
    cv=5,
    n_jobs=-1,
)

# Run the grid search on the scaled training split.
grid_search.fit(X_train_scaled, y_train)

# Model carrying the best hyperparameters found by the search.
best_regressor_grid = grid_search.best_estimator_

# Score that model on the held-out test split.
y_pred_grid = best_regressor_grid.predict(X_test_scaled)
mse_grid = mean_squared_error(y_test, y_pred_grid)
rmse_grid = np.sqrt(mse_grid)
r2_grid = r2_score(y_test, y_pred_grid)

print("Root Mean Squared Error (RMSE) for Best Model (Grid Search):", rmse_grid)
print("R-squared (R2) for Best Model (Grid Search):", r2_grid)

# Report the winning hyperparameter combination.
print("Best Hyperparameters (Grid Search):", grid_search.best_params_)

Root Mean Squared Error (RMSE) for Best Model (Grid Search): 56.30373231787073
R-squared (R2) for Best Model (Grid Search): 0.4127585138245339
Best Hyperparameters (Grid Search): {'learning_rate': 0.01, 'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 200}


Bayesian Optimization

In [None]:
pip install scikit-optimize


Collecting scikit-optimize
  Downloading scikit_optimize-0.9.0-py2.py3-none-any.whl (100 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.3/100.3 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Collecting pyaml>=16.9 (from scikit-optimize)
  Downloading pyaml-23.9.6-py3-none-any.whl (22 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-23.9.6 scikit-optimize-0.9.0


In [None]:
# Imported here rather than at the top because scikit-optimize is installed by
# the cell above.
from skopt import BayesSearchCV

# Bayesian search over the same candidate values as the other searches.
# NOTE(review): skopt appears to treat these multi-element lists as categorical
# choices rather than numeric ranges -- confirm against the skopt docs.
#   n_iter=50 -> number of iterations/evaluations
#   cv=5      -> number of cross-validation folds
bayesian_search = BayesSearchCV(
    estimator=regressor,
    search_spaces=param_dist_random,
    n_iter=50,
    cv=5,
    n_jobs=-1,
    random_state=42,
)

# Run the Bayesian search on the scaled training split.
bayesian_search.fit(X_train_scaled, y_train)

# Model carrying the best hyperparameters found by the search.
best_regressor_bayesian = bayesian_search.best_estimator_

# Score that model on the held-out test split.
y_pred_bayesian = best_regressor_bayesian.predict(X_test_scaled)
mse_bayesian = mean_squared_error(y_test, y_pred_bayesian)
rmse_bayesian = np.sqrt(mse_bayesian)
r2_bayesian = r2_score(y_test, y_pred_bayesian)

print("Root Mean Squared Error (RMSE) for Best Model (Bayesian Search):", rmse_bayesian)
print("R-squared (R2) for Best Model (Bayesian Search):", r2_bayesian)

# Report the winning hyperparameter combination.
print("Best Hyperparameters (Bayesian Search):", bayesian_search.best_params_)



Root Mean Squared Error (RMSE) for Best Model (Bayesian Search): 56.30373231787073
R-squared (R2) for Best Model (Bayesian Search): 0.4127585138245339
Best Hyperparameters (Bayesian Search): OrderedDict([('learning_rate', 0.01), ('max_depth', 5), ('min_samples_leaf', 2), ('min_samples_split', 10), ('n_estimators', 200)])
