In [1]:
import pandas as pd
from smbox.utils import Logger
from smbox.optimise import Optimise
from smbox.smbox_config import smbox_params
from smbox.paramspace import rf_default_param_space
from smbox.default_objectives import rf_objective

In [2]:
# Experiment configuration.
# This dictionary collects everything that describes the run: which dataset to load, which
# algorithm to tune, which search strategy to use and how long it may run.
# It is handed to the optimiser and written to the log so a run can be reproduced later.
# NOTE: no `global` statement is needed here; a name assigned at the top level of a cell
# is already a module-level (notebook-wide) variable.
config = {
    'dataset_source': 'openml',               # Platform the dataset is fetched from.
    'dataset': 38,                            # OpenML dataset identifier.
    'algorithm': 'rf',                        # Algorithm being tuned: Random Forest ('rf').
    'search_strategy': 'smbox',               # Optimisation/search strategy to run.
    'search_strategy_config': smbox_params,   # Strategy settings, imported from smbox.smbox_config.
    'wallclock': 600,                         # Time budget for the search, in seconds (10 minutes).
    'output_root': './output/'                # Directory for saving output/results.
}

# Logger from smbox.utils; record the full configuration at DEBUG level.
logger = Logger()
logger.log(f'Experiment Config: {config}', 'DEBUG')

# Fixed random seed, passed to the optimiser for reproducibility across runs.
_random_seed = 42

In [3]:
def fetch_open_ml_data(dataset_id):
    """
    Download a dataset from OpenML and return it as a single DataFrame.

    The dataset's default target attribute is used as the label and is stored
    in a column called 'target', appended after the feature columns.

    Parameters:
    - dataset_id (int): The identifier of the dataset on OpenML.

    Returns:
    - pd.DataFrame: Feature columns (named after the dataset attributes) plus the 'target' column.
    - str: The name of the target column, always 'target'.
    """
    import openml

    target_column = 'target'

    openml_dataset = openml.datasets.get_dataset(dataset_id)
    # Print the dataset object so its metadata summary appears in the cell output.
    print(openml_dataset)

    # Request the data in array format, split into features and labels.
    # The categorical indicator is returned as well but is not used here.
    features, labels, _categorical_mask, feature_names = openml_dataset.get_data(
        target=openml_dataset.default_target_attribute,
        dataset_format="array",
    )

    frame = pd.DataFrame(features, columns=feature_names)
    frame[target_column] = labels

    return frame, target_column

In [5]:
# Download the dataset whose OpenML ID is set in the experiment config.
df, target_name = fetch_open_ml_data(config['dataset'])

# Preview the first rows as a quick sanity check on the download.
display(df.head())

# Feature matrix: every column except the target, with missing values replaced by 0.
X = df.drop(columns=target_name).fillna(0)

# Label vector: the target column on its own.
y = df[target_name]

OpenML Dataset
Name..........: sick
Version.......: 1
Format........: ARFF
Upload Date...: 2014-04-06 23:22:19
Licence.......: Public
Download URL..: https://api.openml.org/data/v1/download/38/sick.arff
OpenML URL....: https://www.openml.org/d/38
# of features.: 30
# of instances: 3772


Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,TT4_measured,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,TBG,referral_source,target
0,41.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,125.0,0.0,1.14,0.0,109.0,0.0,,0.0,0
1,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,102.0,1.0,,1.0,,0.0,,1.0,0
2,46.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,109.0,0.0,0.91,0.0,120.0,0.0,,1.0,0
3,70.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,175.0,1.0,,1.0,,0.0,,1.0,0
4,70.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,61.0,0.0,0.87,0.0,70.0,0.0,,2.0,0


In [7]:
# Search space: the default Random Forest hyperparameter space shipped with smbox.
cfg_schema = rf_default_param_space

# The SMBOX optimiser takes the training data bundled in a dictionary.
data = dict(X_train=X, y_train=y)

logger.log('-------------Starting SMBOX')
logger.log(f'Initial configuration schema: {cfg_schema}', 'DEBUG')

# Build the optimiser from the experiment config, the Random Forest objective and the fixed seed.
optimiser = Optimise(config, rf_objective, _random_seed)

# Run the search over the schema; it returns the best parameters and the best performance.
best_parameters, best_perf = optimiser.SMBOXOptimise(data, cfg_schema)

2023-09-26 09:41:09: -------------Starting SMBOX
2023-09-26 09:41:09: Starting run for: 38, for 600 seconds
2023-09-26 09:41:32: Global best so far: 0.9631951758565462
2023-09-26 09:41:38: improvement: 0.027920885052071576
2023-09-26 09:41:38: Global best so far: 0.9911160609086178
2023-09-26 09:42:00: improvement: 0.0009919733751566628
2023-09-26 09:42:00: Global best so far: 0.9921080342837745
2023-09-26 09:43:05: improvement: 0.0015330720130961595
2023-09-26 09:43:05: Global best so far: 0.9936411062968706
2023-09-26 09:44:47: improvement: 0.0014203371267609288
2023-09-26 09:44:47: Global best so far: 0.9950614434236316
2023-09-26 09:50:52: improvement: 0.00017525620791802865
2023-09-26 09:50:52: Global best so far: 0.9952366996315496
2023-09-26 09:51:11: Global best: 0.9952366996315496
2023-09-26 09:51:11: Best params: {'max_features': 0.45876741314612923, 'n_estimators': 187, 'max_depth': 65, 'min_samples_leaf': 0.0019923439868258165, 'min_samples_split': 0.0033511392166982656, 'c

In [8]:
# Display the pair returned by SMBOXOptimise: the best hyperparameters and their performance.
best_parameters, best_perf

({'max_features': 0.45876741314612923,
  'n_estimators': 187,
  'max_depth': 65,
  'min_samples_leaf': 0.0019923439868258165,
  'min_samples_split': 0.0033511392166982656,
  'class_weight': 'balanced_subsample'},
 0.9952366996315496)