In [None]:
# 2 quick notes:  Data Leakage & Bayesian Optimization

In [1]:
!pip install bayesian-optimization scikit-learn pandas

Collecting bayesian-optimization
  Downloading bayesian_optimization-1.4.3-py3-none-any.whl (18 kB)
Collecting colorama>=0.4.6 (from bayesian-optimization)
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: colorama, bayesian-optimization
Successfully installed bayesian-optimization-1.4.3 colorama-0.4.6


In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, train_test_split
from bayes_opt import BayesianOptimization

In [3]:
data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv', sep=';')
X = data.drop('quality', axis=1)
y = data['quality']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
X.shape

(1599, 11)

In [6]:
def objective(n_estimators, max_depth, min_samples_split, max_features):
    model = RandomForestRegressor(n_estimators=int(n_estimators),
                                  max_depth=int(max_depth),
                                  min_samples_split=int(min_samples_split),
                                  max_features=min(max_features, 0.999),  # Fraction, must be <= 1.0
                                  random_state=42)

    return -1.0 * cross_val_score(model, X_train, y_train, cv=3, scoring="neg_mean_squared_error").mean()

In [7]:
# Bounds for hyperparameters
param_bounds = {
    'n_estimators': (10, 250),
    'max_depth': (1, 50),
    'min_samples_split': (2, 25),
    'max_features': (0.1, 0.999),
}

In [8]:
optimizer = BayesianOptimization(f=objective, pbounds=param_bounds, random_state=42)
optimizer.maximize(init_points=5, n_iter=15)

|   iter    |  target   | max_depth | max_fe... | min_sa... | n_esti... |
-------------------------------------------------------------------------
| [0m1        [0m | [0m0.3948   [0m | [0m19.35    [0m | [0m0.9547   [0m | [0m18.84    [0m | [0m153.7    [0m |
| [95m2        [0m | [95m0.3985   [0m | [95m8.645    [0m | [95m0.2402   [0m | [95m3.336    [0m | [95m217.9    [0m |
| [0m3        [0m | [0m0.3797   [0m | [0m30.45    [0m | [0m0.7366   [0m | [0m2.473    [0m | [0m242.8    [0m |
| [0m4        [0m | [0m0.3808   [0m | [0m41.79    [0m | [0m0.2909   [0m | [0m6.182    [0m | [0m54.02    [0m |
| [0m5        [0m | [0m0.3886   [0m | [0m15.91    [0m | [0m0.5718   [0m | [0m11.93    [0m | [0m79.89    [0m |
| [0m6        [0m | [0m0.3925   [0m | [0m8.633    [0m | [0m0.3423   [0m | [0m4.297    [0m | [0m217.6    [0m |
| [95m7        [0m | [95m0.4288   [0m | [95m7.583    [0m | [95m0.1053   [0m | [95m2.683    [0m | [95m21

In [10]:
best_params = optimizer.max['params']
best_params

{'max_depth': 4.723019905906878,
 'max_features': 0.26880661490436264,
 'min_samples_split': 4.498807222593216,
 'n_estimators': 216.96593364733022}

In [12]:
best_params_formatted = {
    'n_estimators': int(best_params['n_estimators']),
    'max_depth': int(best_params['max_depth']),
    'min_samples_split': int(best_params['min_samples_split']),
    'max_features': best_params['max_features']
}
best_params_formatted

{'n_estimators': 216,
 'max_depth': 4,
 'min_samples_split': 4,
 'max_features': 0.26880661490436264}

In [13]:
optimized_rf = RandomForestRegressor(**best_params_formatted, random_state=42)


In [14]:
optimized_rf.fit(X_train, y_train)

In [15]:
score = optimized_rf.score(X_test, y_test)
print(f"Test R^2 Score with Optimized Hyperparameters: {score}")

Test R^2 Score with Optimized Hyperparameters: 0.377858715533007
