In [None]:
import copy
import os
import re
import sys
import zipfile
from datetime import datetime

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

# Must run before the smbox imports below so the local checkout is importable
sys.path.insert(0, "./smbox")

from smbox.Utils import Logger
from smbox.Optimise import Optimise
from smbox.smbox_config import smbox_params
from smbox.ParamSpace import rf_default_param_space, xgb_default_param_space

# Do not truncate the column list when a DataFrame is rendered
pd.set_option('display.max_columns', None)

zip_file_path = './playground-series-s3e17.zip'

# Parse both competition tables straight from the ZIP archive; every member
# handle is closed as soon as its CSV has been read into memory.
loaded_frames = {}
with zipfile.ZipFile(zip_file_path, 'r') as archive:
    for member_name in ('train.csv', 'test.csv'):
        with archive.open(member_name) as member:
            loaded_frames[member_name] = pd.read_csv(member)

df_train = loaded_frames['train.csv']
df_test = loaded_frames['test.csv']

In [None]:
# Sanity check of the raw training data: (rows, columns) followed by a two-row preview
train_preview = df_train.head(2)
print(df_train.shape)
display(train_preview)

In [None]:
def preprocess_data(df_input, label_encode_col, one_hot_encode_col):
    """
    Preprocess DataFrame.

    Works on a copy of the input: drops the 'id' column when it is present,
    strips the characters ``<``, ``>``, ``[`` and ``]`` from every column name,
    label-encodes one column and one-hot encodes another.

    A fresh LabelEncoder is fitted on every call, so the integer codes produced
    by two separate calls (e.g. one for train and one for test) are only
    comparable when both frames contain exactly the same set of values.

    :param df_input: The input DataFrame (left unmodified)
    :param label_encode_col: The name of the column to be label encoded; it is
        replaced by a '<name>_encoded' column. Forbidden characters in the name
        are stripped the same way as in the column headers.
    :param one_hot_encode_col: The name of the column to be one-hot encoded; it is
        replaced by one '<name>_<category>' column per category. Forbidden
        characters in the name are stripped the same way as in the column headers.
    :return: The modified DataFrame with the encoded features
    :raises KeyError: If either column to encode is missing from the DataFrame
    """
    forbidden_chars = re.compile(r'[<>\[\]]')

    df = df_input.copy()

    # errors='ignore' keeps the function usable on frames that carry no 'id' column
    df = df.drop(columns=['id'], errors='ignore')

    # Remove forbidden characters from the column headers
    df.columns = [forbidden_chars.sub('', col) for col in df.columns]

    # Sanitise the requested names identically, otherwise a caller passing the
    # original header (e.g. 'Torque [Nm]') would no longer find its column
    label_encode_col = forbidden_chars.sub('', label_encode_col)
    one_hot_encode_col = forbidden_chars.sub('', one_hot_encode_col)

    # Label-encode the specified column into '<name>_encoded', then drop the original
    encoded_col_name = label_encode_col + '_encoded'
    df[encoded_col_name] = LabelEncoder().fit_transform(df[label_encode_col])
    df = df.drop(columns=[label_encode_col])

    # One-hot encode the specified column and append the indicator columns
    one_hot = pd.get_dummies(df[one_hot_encode_col], prefix=one_hot_encode_col)
    df = pd.concat([df, one_hot], axis=1)

    # Drop the original one-hot encoding column from the DataFrame
    df = df.drop(columns=[one_hot_encode_col])

    return df

In [None]:
# Encode the train and test rows in ONE preprocess_data call.
# preprocess_data fits a fresh LabelEncoder (and derives the one-hot columns)
# from whatever frame it receives, so encoding the two splits separately would
# give the same 'Product ID' different integer codes in train and test and
# could produce different 'Type_*' columns.
target_name = 'Machine failure'
n_train = len(df_train)

df_combined = pd.concat(
    [
        df_train.drop(columns=[target_name]),
        # errors='ignore': the test split normally has no target column
        df_test.drop(columns=[target_name], errors='ignore'),
    ],
    axis=0,
    ignore_index=True,  # clean 0..N-1 index, so the first n_train rows are the training rows
)

# Missing values are replaced by 0 in both splits so train and test are treated alike
features_preprocessed = preprocess_data(df_combined, 'Product ID', 'Type').fillna(0)

X_train = features_preprocessed.iloc[:n_train].reset_index(drop=True)
df_test_preprocessed = features_preprocessed.iloc[n_train:].reset_index(drop=True)
y_train = df_train[target_name].reset_index(drop=True)

# Kept for inspection: training features together with the target column
df_train_preprocessed = X_train.assign(**{target_name: y_train.to_numpy()})
print(f'Columns: {list(df_train_preprocessed.columns)}')

assert len(X_train) == len(y_train)
assert list(X_train.columns) == list(df_test_preprocessed.columns)

In [None]:
logger = Logger()

##---- smbox config
# Output directory handed to smbox via config['output_root'].
# Override it with the SMBOX_OUTPUT_ROOT environment variable; the default is
# relative to the notebook so the run does not depend on one user's home directory.
# os.path.join(..., '') guarantees the trailing separator the original value had.
output_root = os.path.join(os.environ.get('SMBOX_OUTPUT_ROOT', './output/'), '')
os.makedirs(output_root, exist_ok=True)

# Define a configuration dict to hold all key information
# NOTE(review): 'dataset_source' says 'openml' although the data is read from a
# local playground-series archive - confirm how smbox uses this value.
config = {
    'dataset_source': 'openml',
    'dataset': 'playground-series-s3e17',
    'algorithm': 'rf',
    'search_strategy': 'smbox',
    'search_strategy_config': smbox_params,
    'wallclock': 3600,
    'output_root': output_root,
}
logger.log(f'Experiment Config: {config}')
##----

data = {"X_train": X_train, "y_train": y_train}  # required data format for SMBOX

# Start from the default hyperparameter search space of the chosen algorithm.
# The defaults are deep-copied so the tweaks below do not modify the objects
# imported from smbox.ParamSpace.
if config['algorithm'] == 'rf':
    cfg_schema = copy.deepcopy(rf_default_param_space)
    # update param space for the max_depth variable
    cfg_schema['tune']['max_depth']['max'] = 250
elif config['algorithm'] == 'xgb':
    cfg_schema = copy.deepcopy(xgb_default_param_space)
    # update the default scale_pos_weight param:
    # count of the smallest label divided by count of the largest label
    classes = y_train.value_counts()
    class_0 = min(classes.index.values)
    class_1 = max(classes.index.values)
    balance_ratio = round(classes[class_0] / classes[class_1], 2)
    cfg_schema['fix']['scale_pos_weight'] = balance_ratio
else:
    # Fail here with a clear message instead of a NameError on cfg_schema below
    raise ValueError(
        f"Unsupported algorithm {config['algorithm']!r}; expected 'rf' or 'xgb'"
    )


logger.log('-------------Starting SMBOX')
logger.log(f'Initial configuration schema: {cfg_schema}')

optimiser = Optimise(config, random_seed=42)
best_parameters = optimiser.SMBOXOptimise(data, cfg_schema)

## Generate predictions on the test set using the best found params

In [None]:
# Refit the selected algorithm on the full training set using the
# hyperparameters returned by the SMBOX search.
# RandomForestClassifier / XGBClassifier are imported in the notebook's first cell.
if config['algorithm'] == 'rf':
    model = RandomForestClassifier(**best_parameters, random_state=42)
elif config['algorithm'] == 'xgb':
    model = XGBClassifier(**best_parameters, random_state=42)
else:
    # Fail here with a clear message instead of a NameError on model below
    raise ValueError(
        f"Unsupported algorithm {config['algorithm']!r}; expected 'rf' or 'xgb'"
    )

model.fit(X_train, y_train)

In [None]:
# Column 1 of predict_proba is the probability of the second entry of
# model.classes_ (label 1 for a 0/1 target).
failure_probability = model.predict_proba(df_test_preprocessed)[:, 1]

# Build the submission as a new frame instead of adding the prediction column
# to df_test: mutating df_test would make a re-run of the preprocessing cell
# pick the predictions up as an input feature.
df_submission = pd.DataFrame({
    'id': df_test['id'].to_numpy(),
    'Machine failure': failure_probability,
})

In [None]:
# Timestamped file name (second resolution) so runs made at different times
# do not overwrite each other's submission file.
# datetime is imported in the notebook's first cell.
timestamp = datetime.now().strftime("%Y_%m_%d_%H%M%S")
output_path = f"submission_{config['algorithm']}_{timestamp}.csv"

# index=False: the submission file carries only the 'id' and 'Machine failure' columns
df_submission.to_csv(output_path, index=False)