In [2]:
!pip -q install optuna

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
import optuna

# Load the dataset
df = pd.read_csv('/content/happy.csv')

# Define feature columns and target column (using normalized features)
features = ['feat1', 'feat2', 'feat3', 'feat4', 'feat5']
target = 'z_spec'

# Handle missing values
df = df.dropna(subset=features + [target])

X = df[features]
y = df[target]

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the objective function for Optuna
def objective(trial):
    model_type = trial.suggest_categorical('model_type', ['RandomForest', 'GradientBoosting'])
    if model_type == 'RandomForest':
        n_estimators = trial.suggest_int('n_estimators', 50, 300)
        max_depth = trial.suggest_int('max_depth', 10, 50)
        model = RandomForestRegressor(
            n_estimators=n_estimators,
            max_depth=max_depth,
            random_state=42
        )
    else:
        learning_rate = trial.suggest_loguniform('learning_rate', 0.01, 0.2)
        n_estimators = trial.suggest_int('n_estimators', 50, 300)
        max_depth = trial.suggest_int('max_depth', 10, 50)
        model = GradientBoostingRegressor(
            learning_rate=learning_rate,
            n_estimators=n_estimators,
            max_depth=max_depth,
            random_state=42
        )

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    return mse

# Create a study and optimize the objective function
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

# Get the best hyperparameters
best_params = study.best_params
print(f'Best hyperparameters: {best_params}')

# Train and evaluate the best model
if best_params['model_type'] == 'RandomForest':
    best_model = RandomForestRegressor(
        n_estimators=best_params['n_estimators'],
        max_depth=best_params['max_depth'],
        random_state=42
    )
else:
    best_model = GradientBoostingRegressor(
        learning_rate=best_params['learning_rate'],
        n_estimators=best_params['n_estimators'],
        max_depth=best_params['max_depth'],
        random_state=42
    )

best_model.fit(X_train, y_train)
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = np.sqrt(mse_train)
r2_train = best_model.score(X_train, y_train)

mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = np.sqrt(mse_test)
r2_test = best_model.score(X_test, y_test)

print(f'Training MSE: {mse_train}')
print(f'Training RMSE: {rmse_train}')
print(f'Training R²: {r2_train}')
print(f'Test MSE: {mse_test}')
print(f'Test RMSE: {rmse_test}')
print(f'Test R²: {r2_test}')

[I 2024-07-09 05:53:37,329] A new study created in memory with name: no-name-40a7dad3-5082-4ae3-8708-0f7e92d63d30
[I 2024-07-09 05:58:10,750] Trial 0 finished with value: 0.010602612222224458 and parameters: {'model_type': 'RandomForest', 'n_estimators': 253, 'max_depth': 11}. Best is trial 0 with value: 0.010602612222224458.
  learning_rate = trial.suggest_loguniform('learning_rate', 0.01, 0.2)
[I 2024-07-09 06:02:15,167] Trial 1 finished with value: 0.01550723495168882 and parameters: {'model_type': 'GradientBoosting', 'learning_rate': 0.10971161273986456, 'n_estimators': 75, 'max_depth': 32}. Best is trial 0 with value: 0.010602612222224458.
  learning_rate = trial.suggest_loguniform('learning_rate', 0.01, 0.2)
[I 2024-07-09 06:05:22,514] Trial 2 finished with value: 0.016369350024180183 and parameters: {'model_type': 'GradientBoosting', 'learning_rate': 0.011611294071836748, 'n_estimators': 73, 'max_depth': 19}. Best is trial 0 with value: 0.010602612222224458.
[I 2024-07-09 06:09: