In [4]:
import warnings
import logging
import optuna

optuna.logging.set_verbosity(optuna.logging.WARNING)
logging.getLogger("optuna").setLevel(logging.ERROR)


warnings.filterwarnings('ignore', category=UserWarning, module='optuna')
warnings.filterwarnings('ignore', message='.*Trial .* is omitted in visualization.*')

STUDY_DB_PATH = "sqlite:///../optuna_studies/tuning.db"



In [5]:
def get_all_study_names(storage_url):
    """
    Get all study names from the Optuna database.
    
    Args:
        storage_url (str): The storage URL for the Optuna database
        
    Returns:
        list: List of all study names in the database
    """
    import sqlite3
    import urllib.parse
    
    # Parse the storage URL to get the database path
    if storage_url.startswith("sqlite:///"):
        db_path = storage_url[10:]  # Remove "sqlite:///" prefix
    else:
        raise ValueError(f"Unsupported storage URL format: {storage_url}")
    
    try:
        # Connect to the SQLite database
        conn = sqlite3.connect(db_path)
        cursor = conn.cursor()
        
        # Query the studies table to get all study names
        cursor.execute("SELECT study_name FROM studies")
        study_names = [row[0] for row in cursor.fetchall()]
        
        conn.close()
        return study_names
        
    except sqlite3.Error as e:
        print(f"Database error: {e}")
        return []
    except Exception as e:
        print(f"Error: {e}")
        return []

# Get all available study names from the database
all_study_names = get_all_study_names(STUDY_DB_PATH)
print(f"Found {len(all_study_names)} studies in the database:")
for i, name in enumerate(all_study_names, 1):
    print(f"{i:2d}. {name}")
    
print(f"\nDatabase path: {STUDY_DB_PATH}")


Found 66 studies in the database:
 1. LightGBM_rfecv_all_nfeat_60_pca_count_mape_v1_study
 2. LightGBM_rfecv_all_nfeat_80_nonpca_count_mape_v1_study
 3. LightGBM_rfecv_base_poi_pano_nfeat_160_nonpca_count_loc_mape_v1_study
 4. LightGBM_rfecv_base_nfeat_118_nonpca_count_loc_mape_v1_study
 5. LightGBM_rfecv_base_nfeat_117_nonpca_count_loc_mape_v1_study
 6. LightGBM_rfecv_all_nfeat_152_nonpca_count_loc_mape_v1_study
 7. LightGBM_comb_all_nfeat_201_pca_count_loc_mape_v1_study
 8. LightGBM_elasticnet_all_nfeat_296_pca_scaled_count_loc_mape_v1_study
 9. LightGBM_rfecv_base_nfeat_112_nonpca_count_loc_mape_v2_study
10. XGBoost_rfecv_base_nfeat_112_nonpca_count_loc_mape_v2_study
11. LightGBM_rfecv_base_nfeat_112_nonpca_count_loc_mape_v3_study
12. LightGBM_comb_all_nfeat_201_pca_count_loc_mape_v3_study
13. LightGBM_rfecv_all_nfeat_152_nonpca_count_loc_mape_v3_study
14. LightGBM_rfecv_all_nfeat_160_umap_count_loc_mape_v1_study
15. LightGBM_rfecv_all_nfeat_160_pca_count_loc_mape_v1_study
16. Light

In [6]:
# Compare multiple studies
import plotly.subplots as sp
import plotly.graph_objects as go
import re


def extract_short_study_name(study_name_tuple):
    """
    turns study name into a shorter readable name.

    The expected format for the study name is:
    {model}_{method}_{dataset}_nfeat_{n_features}_{pca|nonpca}_{count}_{metric}_{version}_study

    Args:
        study_name_tuple (tuple): A tuple containing the study name string, e.g.,
                                  ('LightGBM_rfecv_all_nfeat_60_pca_count_mape_v1_study',)

    Returns:
        str: A formatted string like "model dataset nfeatures pca|nonpca",
             or None if the 'nfeat' marker is not found in the study name.
    """
    # Extract the actual study name string from the input tuple
    study_name = study_name_tuple

    # Split the study name by underscores to get its individual components
    parts = study_name.split('_')

    # Extract the model, which is always the first part
    model = parts[0]

    try:
        # Find the index of the 'nfeat' marker to locate subsequent elements
        nfeat_idx = parts.index('nfeat')

        # Extract the number of features, which is immediately after 'nfeat'
        n_features = parts[nfeat_idx + 1]

        # Extract the PCA/non-PCA status, which is after n_features
        pca_nonpca = parts[nfeat_idx + 2]


    except ValueError:
        print(f"Warning: 'nfeat' marker not found in study name: {study_name}. "
              "Cannot extract all required information.")
        return None
    except IndexError:

        print(f"Warning: Insufficient parts after 'nfeat' in study name: {study_name}. "
              "Cannot extract all required information.")
        return None
    
    # extract version with the format vX
    version = re.search(r'v(\d+)', study_name).group(1)


    dataset_parts = parts[2:nfeat_idx]
    dataset = "_".join(dataset_parts) 

    shorter_name = f"{model} {dataset} {n_features} {pca_nonpca} v{version}"
    return shorter_name




studies_to_compare = [name for name in all_study_names if "LightGBM" in name and "v8" in name][-8:]

# Load all studies
loaded_studies = {
    name: optuna.load_study(study_name=name, storage=STUDY_DB_PATH)
    for name in studies_to_compare
}

rows = len(studies_to_compare) // 2 + 1
# Create subplots for optimization history
fig = sp.make_subplots(
    rows=rows, cols=2,
    subplot_titles=[extract_short_study_name(name) for name in studies_to_compare],
    vertical_spacing=0.15,
    horizontal_spacing=0.1
)

# Plot optimization history for each study
for idx, (name, study) in enumerate(loaded_studies.items()):
    row = idx // 2 + 1
    col = idx % 2 + 1
    
    # Get optimization history data
    trials = study.trials
    values = [t.value for t in trials if t.value is not None]
    best_values = [min(values[:i+1]) for i in range(len(values))]
    
    # Add traces
    fig.add_trace(
        go.Scatter(y=values, name=f"{name} (trials)", showlegend=False, 
                  mode='markers', marker=dict(size=4)),
        row=row, col=col
    )
    fig.add_trace(
        go.Scatter(y=best_values, name=f"{name} (best)", showlegend=False,
                  line=dict(color='red')),
        row=row, col=col
    )
    
    # Update axes labels
    fig.update_xaxes(title_text="Trial", row=row, col=col)
    fig.update_yaxes(title_text="MAPE", row=row, col=col)

fig.update_layout(
    height=800, 
    width=1200,
    title_text="Optimization History Comparison",
    title_x=0.5
)
fig.show()

# Print convergence statistics
print("\nConvergence Statistics:")
print("=" * 80)
for name, study in loaded_studies.items():
    trials = study.trials
    n_trials_skip = 10
    if len(trials)<n_trials_skip:
        # delete from studies_to_compare
        studies_to_compare.remove(name)
        print(f"Study {name} has less than {n_trials_skip} trials, skipping.")
        continue

    values = [t.value for t in trials if t.value is not None]
    best_value = min(values)

    if best_value < 0 or best_value > 100 or len(values) < n_trials_skip:
        studies_to_compare.remove(name)
        print(f"Study {name} has a negative best value, skipping.")
        continue
    
    # Calculate when we reached within 2.5% of best value
    threshold = best_value * 1.025
    convergence_trial = next(
        (i for i, v in enumerate(values) if v <= threshold), 
        len(values)
    )
    
    print(f"\nStudy: {name}")
    print(f"Best MAPE: {best_value:.4f} 2.5% of best: {threshold:.4f}")
    print(f"Trials to reach within 2.5% of best: {convergence_trial}")
    print(f"Total trials: {len(values)}")


copy_loaded_studies = loaded_studies.copy()

for name, study in copy_loaded_studies.items():
    if name not in studies_to_compare:
        loaded_studies.pop(name)



Convergence Statistics:

Study: LightGBM_rfecv_all_nfeat_158_pca_count_loc_mape_v8_study
Best MAPE: 0.0599 2.5% of best: 0.0614
Trials to reach within 2.5% of best: 14
Total trials: 100

Study: LightGBM_rfecv_base_nfeat_56_nonpca_count_mape_v8_study
Best MAPE: 0.0710 2.5% of best: 0.0728
Trials to reach within 2.5% of best: 14
Total trials: 100

Study: LightGBM_rfecv_base_poi_pano_nfeat_158_nonpca_count_loc_mape_v8_study
Best MAPE: 0.0613 2.5% of best: 0.0628
Trials to reach within 2.5% of best: 6
Total trials: 100

Study: LightGBM_rfecv_base_img_nfeat_56_pca_count_50_mape_v8_study
Best MAPE: 0.0730 2.5% of best: 0.0748
Trials to reach within 2.5% of best: 2
Total trials: 92

Study: LightGBM_rfecv_base_img_nfeat_56_pca_count_mape_v8_study
Best MAPE: 0.0725 2.5% of best: 0.0743
Trials to reach within 2.5% of best: 0
Total trials: 100


In [7]:
studies_to_compare

['LightGBM_rfecv_all_nfeat_158_pca_count_loc_mape_v8_study',
 'LightGBM_rfecv_base_nfeat_56_nonpca_count_mape_v8_study',
 'LightGBM_rfecv_base_poi_pano_nfeat_158_nonpca_count_loc_mape_v8_study',
 'LightGBM_rfecv_base_img_nfeat_56_pca_count_50_mape_v8_study',
 'LightGBM_rfecv_base_img_nfeat_56_pca_count_mape_v8_study']

In [None]:
# Compare hyperparameter importance and search spaces
nrows = len(studies_to_compare) // 2 + 1

fig = sp.make_subplots(
    rows=nrows, cols=2,
    subplot_titles=[name for name in studies_to_compare],
    #vertical_spacing=0.4,
    horizontal_spacing=0.1
)

for idx, (name, study) in enumerate(loaded_studies.items()):
    print(f"Study: {name}")
    row = idx // 2 + 1
    col = idx % 2 + 1
    
    # Get hyperparameter importance
    importance = optuna.importance.get_param_importances(study)
    params = list(importance.keys())
    scores = list(importance.values())
    
    # Add bar plot
    fig.add_trace(
        go.Bar(x=params, y=scores),
        row=row, col=col
    )
    
    # Update axes
    fig.update_xaxes(tickangle=45, row=row, col=col)
    fig.update_yaxes(title_text="Importance", row=row, col=col)

fig.update_layout(
    height=800,
    width=1200,
    title_text="Hyperparameter Importance Comparison",
    title_x=0.5,
    showlegend=False,

)
fig.show()

# Analyze search spaces and best parameters
print("\nSearch Space Analysis:")
print("=" * 80)
for name, study in loaded_studies.items():
    print(f"\nStudy: {name}")
    print("-" * 40)
    
    best_trial = study.best_trial
    
    # Get all parameter names from all trials
    all_params = set()
    for trial in study.trials:
        all_params.update(trial.params.keys())
    
    # Analyze parameter ranges and best values
    for param in sorted(all_params):
        values = [t.params.get(param) for t in study.trials if param in t.params]
        values = [v for v in values if type(v) in (float, int)]  # Filter out None values
        if values:            
            # Check if values are numeric or categorical
            if all(isinstance(v, (int, float)) for v in values):

                
                # Check if best value is near bounds
                min_val, max_val = min(values), max(values)
                best_val = best_trial.params.get(param)
                if best_val is not None:
                    range_size = max_val - min_val
                    if range_size > 0 and (abs(best_val - min_val) < 0.05 * range_size or abs(best_val - max_val) < 0.05 * range_size):
                        print(f"\nParameter: {param}")
                        print(f"Range: [{min(values):.4f}, {max(values):.4f}]")
                        best_val = best_trial.params.get(param)
                        print(f"Best value: {best_val:.4f}")
                        if abs(best_val - min_val) < 0.05 * range_size:
                            print("⚠️ Best value is near lower bound - consider extending range lower")
                        elif abs(best_val - max_val) < 0.05 * range_size:
                            print("⚠️ Best value is near upper bound - consider extending range higher")
            else:
                unique_values = sorted(set(values))
                print(f"Categorical values: {unique_values}")
                print(f"Best value: {best_trial.params.get(param, 'N/A')}")


Study: LightGBM_rfecv_all_nfeat_158_pca_count_loc_mape_v8_study
Study: LightGBM_rfecv_base_nfeat_56_nonpca_count_mape_v8_study
Study: LightGBM_rfecv_base_poi_pano_nfeat_158_nonpca_count_loc_mape_v8_study
Study: LightGBM_rfecv_base_img_nfeat_56_pca_count_50_mape_v8_study
Study: LightGBM_rfecv_base_img_nfeat_56_pca_count_mape_v8_study



Search Space Analysis:

Study: LightGBM_rfecv_all_nfeat_158_pca_count_loc_mape_v8_study
----------------------------------------

Parameter: model__learning_rate
Range: [0.0100, 0.0951]
Best value: 0.0109
⚠️ Best value is near lower bound - consider extending range lower

Parameter: model__reg_alpha
Range: [0.0000, 6.0187]
Best value: 0.0001
⚠️ Best value is near lower bound - consider extending range lower

Parameter: model__reg_lambda
Range: [0.0000, 188.3720]
Best value: 1.5874
⚠️ Best value is near lower bound - consider extending range lower

Study: LightGBM_rfecv_base_nfeat_56_nonpca_count_mape_v8_study
----------------------------------------

Parameter: model__learning_rate
Range: [0.0104, 0.0999]
Best value: 0.0127
⚠️ Best value is near lower bound - consider extending range lower

Parameter: model__min_child_samples
Range: [5.0000, 100.0000]
Best value: 5.0000
⚠️ Best value is near lower bound - consider extending range lower

Parameter: model__reg_alpha
Range: [0.0000, 5.69

In [9]:
# Create slice plots for each study
for name, study in loaded_studies.items():
    print(f"\nSlice plot for: {name}")
    fig = optuna.visualization.plot_slice(study)
    fig.update_layout(
        #height=800,
        #width=1200,
        title_text=f"Slice Plot - {extract_short_study_name(name)}",
        #title_x=0.5

    )
    fig.show()



Slice plot for: LightGBM_rfecv_all_nfeat_158_pca_count_loc_mape_v8_study



Slice plot for: LightGBM_rfecv_base_nfeat_56_nonpca_count_mape_v8_study



Slice plot for: LightGBM_rfecv_base_poi_pano_nfeat_158_nonpca_count_loc_mape_v8_study



Slice plot for: LightGBM_rfecv_base_img_nfeat_56_pca_count_50_mape_v8_study



Slice plot for: LightGBM_rfecv_base_img_nfeat_56_pca_count_mape_v8_study


In [10]:
# get best hyperparameters from a study

study_name = "LightGBM_rfecv_all_nfeat_158_pca_count_loc_mape_v6_study"
study = optuna.load_study(study_name=study_name, storage=STUDY_DB_PATH)
best_trial = study.best_trial

print(best_trial.params)

{'model__objective': 'regression_l2', 'model__learning_rate': 0.011887396180653893, 'model__n_estimators': 5400, 'model__num_leaves': 228, 'model__max_depth': 136, 'model__min_child_samples': 48, 'model__subsample': 0.8210028297163644, 'model__colsample_bytree': 0.6837500455490874, 'model__reg_alpha': 0.00035186259300270015, 'model__reg_lambda': 0.0006700023201929432}
