In [1]:
import warnings
import logging
import optuna

optuna.logging.set_verbosity(optuna.logging.WARNING)
logging.getLogger("optuna").setLevel(logging.ERROR)


warnings.filterwarnings('ignore', category=UserWarning, module='optuna')
warnings.filterwarnings('ignore', message='.*Trial .* is omitted in visualization.*')

STUDY_DB_PATH = "sqlite:///../optuna_studies/tuning.db"



  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# Compare multiple studies
import plotly.subplots as sp
import plotly.graph_objects as go
import re


def extract_short_study_name(study_name_tuple):
    """
    turns study name into a shorter readable name.

    The expected format for the study name is:
    {model}_{method}_{dataset}_nfeat_{n_features}_{pca|nonpca}_{count}_{metric}_{version}_study

    Args:
        study_name_tuple (tuple): A tuple containing the study name string, e.g.,
                                  ('LightGBM_rfecv_all_nfeat_60_pca_count_mape_v1_study',)

    Returns:
        str: A formatted string like "model dataset nfeatures pca|nonpca",
             or None if the 'nfeat' marker is not found in the study name.
    """
    # Extract the actual study name string from the input tuple
    study_name = study_name_tuple

    # Split the study name by underscores to get its individual components
    parts = study_name.split('_')

    # Extract the model, which is always the first part
    model = parts[0]

    try:
        # Find the index of the 'nfeat' marker to locate subsequent elements
        nfeat_idx = parts.index('nfeat')

        # Extract the number of features, which is immediately after 'nfeat'
        n_features = parts[nfeat_idx + 1]

        # Extract the PCA/non-PCA status, which is after n_features
        pca_nonpca = parts[nfeat_idx + 2]
    except ValueError:
        print(f"Warning: 'nfeat' marker not found in study name: {study_name}. "
              "Cannot extract all required information.")
        return None
    except IndexError:

        print(f"Warning: Insufficient parts after 'nfeat' in study name: {study_name}. "
              "Cannot extract all required information.")
        return None


    dataset_parts = parts[2:nfeat_idx]
    dataset = "_".join(dataset_parts) 

    shorter_name = f"{model} {dataset} {n_features} {pca_nonpca}"
    return shorter_name



studies_to_compare = [
    'LightGBM_rfecv_all_nfeat_60_pca_count_mape_v1_study',
    'LightGBM_rfecv_all_nfeat_80_nonpca_count_mape_v1_study',
    'LightGBM_rfecv_base_poi_pano_nfeat_160_nonpca_count_loc_mape_v1_study',
    'LightGBM_rfecv_base_nfeat_117_nonpca_count_loc_mape_v1_study'
]

# Load all studies
loaded_studies = {
    name: optuna.load_study(study_name=name, storage=STUDY_DB_PATH)
    for name in studies_to_compare
}

# Create subplots for optimization history
fig = sp.make_subplots(
    rows=2, cols=2,
    subplot_titles=[extract_short_study_name(name) for name in studies_to_compare],
    vertical_spacing=0.15,
    horizontal_spacing=0.1
)

# Plot optimization history for each study
for idx, (name, study) in enumerate(loaded_studies.items()):
    row = idx // 2 + 1
    col = idx % 2 + 1
    
    # Get optimization history data
    trials = study.trials
    values = [t.value for t in trials if t.value is not None]
    best_values = [min(values[:i+1]) for i in range(len(values))]
    
    # Add traces
    fig.add_trace(
        go.Scatter(y=values, name=f"{name} (trials)", showlegend=False, 
                  mode='markers', marker=dict(size=4)),
        row=row, col=col
    )
    fig.add_trace(
        go.Scatter(y=best_values, name=f"{name} (best)", showlegend=False,
                  line=dict(color='red')),
        row=row, col=col
    )
    
    # Update axes labels
    fig.update_xaxes(title_text="Trial", row=row, col=col)
    fig.update_yaxes(title_text="MAPE", row=row, col=col)

fig.update_layout(
    height=800, 
    width=1200,
    title_text="Optimization History Comparison",
    title_x=0.5
)
fig.show()

# Print convergence statistics
print("\nConvergence Statistics:")
print("=" * 80)
for name, study in loaded_studies.items():
    trials = study.trials
    values = [t.value for t in trials if t.value is not None]
    best_value = min(values)
    
    # Calculate when we reached within 2.5% of best value
    threshold = best_value * 1.025
    convergence_trial = next(
        (i for i, v in enumerate(values) if v <= threshold), 
        len(values)
    )
    
    print(f"\nStudy: {name}")
    print(f"Best MAPE: {best_value:.4f} 2.5% of best: {threshold:.4f}")
    print(f"Trials to reach within 2.5% of best: {convergence_trial}")
    print(f"Total trials: {len(values)}")



Convergence Statistics:

Study: LightGBM_rfecv_all_nfeat_60_pca_count_mape_v1_study
Best MAPE: 0.0579 2.5% of best: 0.0593
Trials to reach within 2.5% of best: 16
Total trials: 100

Study: LightGBM_rfecv_all_nfeat_80_nonpca_count_mape_v1_study
Best MAPE: 0.0578 2.5% of best: 0.0593
Trials to reach within 2.5% of best: 20
Total trials: 100

Study: LightGBM_rfecv_base_poi_pano_nfeat_160_nonpca_count_loc_mape_v1_study
Best MAPE: 0.0563 2.5% of best: 0.0577
Trials to reach within 2.5% of best: 29
Total trials: 100

Study: LightGBM_rfecv_base_nfeat_117_nonpca_count_loc_mape_v1_study
Best MAPE: 0.0559 2.5% of best: 0.0573
Trials to reach within 2.5% of best: 16
Total trials: 100


In [12]:
# Compare hyperparameter importance and search spaces
fig = sp.make_subplots(
    rows=2, cols=2,
    subplot_titles=[name for name in studies_to_compare],
    vertical_spacing=0.4,
    horizontal_spacing=0.1
)

for idx, (name, study) in enumerate(loaded_studies.items()):
    row = idx // 2 + 1
    col = idx % 2 + 1
    
    # Get hyperparameter importance
    importance = optuna.importance.get_param_importances(study)
    params = list(importance.keys())
    scores = list(importance.values())
    
    # Add bar plot
    fig.add_trace(
        go.Bar(x=params, y=scores),
        row=row, col=col
    )
    
    # Update axes
    fig.update_xaxes(tickangle=45, row=row, col=col)
    fig.update_yaxes(title_text="Importance", row=row, col=col)

fig.update_layout(
    height=800,
    width=1200,
    title_text="Hyperparameter Importance Comparison",
    title_x=0.5,
    showlegend=False
)
fig.show()

# Analyze search spaces and best parameters
print("\nSearch Space Analysis:")
print("=" * 80)
for name, study in loaded_studies.items():
    print(f"\nStudy: {name}")
    print("-" * 40)
    
    best_trial = study.best_trial
    
    # Get all parameter names from all trials
    all_params = set()
    for trial in study.trials:
        all_params.update(trial.params.keys())
    
    # Analyze parameter ranges and best values
    for param in sorted(all_params):
        values = [t.params.get(param) for t in study.trials if param in t.params]
        if values:            
            # Check if values are numeric or categorical
            if all(isinstance(v, (int, float)) for v in values):

                
                # Check if best value is near bounds
                min_val, max_val = min(values), max(values)
                best_val = best_trial.params.get(param)
                if best_val is not None:
                    range_size = max_val - min_val
                    if range_size > 0 and (abs(best_val - min_val) < 0.05 * range_size or abs(best_val - max_val) < 0.05 * range_size):
                        print(f"\nParameter: {param}")
                        print(f"Range: [{min(values):.4f}, {max(values):.4f}]")
                        best_val = best_trial.params.get(param)
                        print(f"Best value: {best_val:.4f}")
                        if abs(best_val - min_val) < 0.05 * range_size:
                            print("⚠️ Best value is near lower bound - consider extending range lower")
                        elif abs(best_val - max_val) < 0.05 * range_size:
                            print("⚠️ Best value is near upper bound - consider extending range higher")
            else:
                unique_values = sorted(set(values))
                print(f"Categorical values: {unique_values}")
                print(f"Best value: {best_trial.params.get(param, 'N/A')}")



Search Space Analysis:

Study: LightGBM_rfecv_all_nfeat_60_pca_count_mape_v1_study
----------------------------------------
Categorical values: ['huber', 'regression_l1', 'regression_l2']
Best value: huber

Parameter: model__reg_alpha
Range: [0.0100, 155.9724]
Best value: 0.0544
⚠️ Best value is near lower bound - consider extending range lower

Parameter: model__reg_lambda
Range: [0.0101, 183.1142]
Best value: 0.0308
⚠️ Best value is near lower bound - consider extending range lower

Study: LightGBM_rfecv_all_nfeat_80_nonpca_count_mape_v1_study
----------------------------------------

Parameter: model__num_leaves
Range: [21.0000, 200.0000]
Best value: 196.0000
⚠️ Best value is near upper bound - consider extending range higher
Categorical values: ['huber', 'regression_l1', 'regression_l2']
Best value: huber

Parameter: model__reg_alpha
Range: [0.0100, 140.4674]
Best value: 0.0202
⚠️ Best value is near lower bound - consider extending range lower

Parameter: model__reg_lambda
Range: 

In [13]:
# Create slice plots for each study
for name, study in loaded_studies.items():
    print(f"\nSlice plot for: {name}")
    fig = optuna.visualization.plot_slice(study)
    fig.update_layout(
        #height=800,
        #width=1200,
        title_text=f"Slice Plot - {extract_short_study_name(name)}",
        #title_x=0.5
    )
    fig.show()



Slice plot for: LightGBM_rfecv_all_nfeat_60_pca_count_mape_v1_study



Slice plot for: LightGBM_rfecv_all_nfeat_80_nonpca_count_mape_v1_study



Slice plot for: LightGBM_rfecv_base_poi_pano_nfeat_160_nonpca_count_loc_mape_v1_study



Slice plot for: LightGBM_rfecv_base_nfeat_117_nonpca_count_loc_mape_v1_study
