<a href="https://colab.research.google.com/github/smm-0216/MLOps-tools/blob/main/Optuna.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%pip install optuna

Collecting optuna
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.4.0-py3-none-any.whl (395 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m395.9/395.9 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.16.2-py3-none-any.whl (242 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m242.7/242.7 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.16.2 colorlog-6.9.0 optuna-4.4.0


In [2]:
import optuna
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import pandas as pd

In [None]:
data = pd.read_csv('/content/drive/MyDrive/MLOps - 2024_2/Pipeline/medical_insurance.csv')
encoder = OneHotEncoder(drop='first')
encoder.fit(data[['sex', 'region', 'smoker']])
data[encoder.get_feature_names_out()] = encoder.transform(data[['sex', 'region', 'smoker']]).toarray()
data.drop(['sex', 'region', 'smoker'], axis=1, inplace=True)

scaler = StandardScaler()
X = data.drop('charges', axis=1)
y = data['charges']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 200, 600)
    max_depth = trial.suggest_int('max_depth', 5, 30)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 5)

    model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42
    )
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = mean_squared_error(y_test, y_pred)

    return mse

In [None]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=30)

print("Mejores hiperparámetros encontrados:")
print(study.best_params)

[I 2024-11-26 02:13:37,432] A new study created in memory with name: no-name-88296f5b-b567-498b-964c-994f73bd10b9
[I 2024-11-26 02:13:39,877] Trial 0 finished with value: 10408885.041470371 and parameters: {'n_estimators': 442, 'max_depth': 17, 'min_samples_split': 5, 'min_samples_leaf': 1}. Best is trial 0 with value: 10408885.041470371.
[I 2024-11-26 02:13:41,312] Trial 1 finished with value: 17684436.808493335 and parameters: {'n_estimators': 362, 'max_depth': 9, 'min_samples_split': 8, 'min_samples_leaf': 5}. Best is trial 0 with value: 10408885.041470371.
[I 2024-11-26 02:13:42,489] Trial 2 finished with value: 12313932.733162865 and parameters: {'n_estimators': 217, 'max_depth': 18, 'min_samples_split': 2, 'min_samples_leaf': 2}. Best is trial 0 with value: 10408885.041470371.
[I 2024-11-26 02:13:44,337] Trial 3 finished with value: 20178325.789431218 and parameters: {'n_estimators': 580, 'max_depth': 6, 'min_samples_split': 9, 'min_samples_leaf': 5}. Best is trial 0 with value: 

Mejores hiperparámetros encontrados:
{'n_estimators': 425, 'max_depth': 20, 'min_samples_split': 2, 'min_samples_leaf': 1}


In [None]:
best_params = study.best_params
final_model = RandomForestRegressor(
    **best_params,
    random_state=42
)
final_model.fit(X_train, y_train)
final_y_pred = final_model.predict(X_test)
final_mse = mean_squared_error(y_test, final_y_pred)
print(f"Mean Squared Error del modelo final: {final_mse}")

Mean Squared Error del modelo final: 7649730.555062296


In [None]:
final_model.score(X_test,y_test)

0.9501584385723729

In [None]:
import optuna.visualization as vis

vis.plot_optimization_history(study).show()

vis.plot_slice(study).show()