In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, Normalizer, RobustScaler, OneHotEncoder, TargetEncoder
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import optuna

In [3]:
#get data
train_clean = pd.read_csv('../prediction-task/train-clean.csv')
test_clean = pd.read_csv('../prediction-task/test-clean.csv')

In [7]:
#ensure cat cols are understood to be cat
for cat_col in ['MODE', 'POWER']:
    train_clean[cat_col] = train_clean[cat_col].astype('category')
    test_clean[cat_col] =  test_clean[cat_col].astype('category')

In [8]:
#get training features & target
X = train_clean.drop('OUTPUT', axis='columns')
y = train_clean['OUTPUT']

In [9]:
#split data into train and val sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=1)

In [10]:
num_cols = ['AMPS', 'VOLTS', 'TEMP', 'DELTA', 'GAMMA']
cat_cols = ['POWER', 'MODE']

#custom func to calculate rmse
def get_rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

#preprocessing funcs
def get_num_transformer(scaler_type):
    if scaler_type == 'StandardScaler':
        num_transformer = Pipeline(steps=[('scaler', StandardScaler())])
    elif scaler_type == 'MinMaxScaler':
        num_transformer = Pipeline(steps=[('scaler', MinMaxScaler())])
    elif scaler_type == 'MaxAbsScaler':
        num_transformer = Pipeline(steps=[('scaler', MaxAbsScaler())])
    elif scaler_type == 'Normalizer':
        num_transformer = Pipeline(steps=[('scaler', Normalizer())])
    elif scaler_type == 'RobustScaler':
        num_transformer = Pipeline(steps=[('scaler', RobustScaler())])
    return num_transformer

def get_cat_transformer(encoder_type):
    if encoder_type == 'OneHotEncoder':
        cat_transformer = Pipeline(steps=[('encoder', OneHotEncoder())])
    elif encoder_type == 'TargetEncoder':
        cat_transformer = Pipeline(steps=[('encoder', TargetEncoder())])
    return cat_transformer

def get_preprocessor(scaler_type, encoder_type, num_cols, cat_cols):
    preprocessor = ColumnTransformer(
        transformers=[
            ('scale', get_num_transformer(scaler_type), num_cols),
            ('encode', get_cat_transformer(encoder_type), cat_cols)
        ])
    return preprocessor

#objective function for hyperparam tuning
def objective(trial):
    #tune preprocessing params
    scaler_type = trial.suggest_categorical('scaler_type', ['StandardScaler', 'MinMaxScaler', 'MaxAbsScaler', 'Normalizer', 'RobustScaler'])
    encoder_type = trial.suggest_categorical('encoder_type', ['OneHotEncoder', 'TargetEncoder'])
    preprocessor = get_preprocessor(scaler_type, encoder_type, num_cols, cat_cols)
        
    #tune model type
    regressor_type = trial.suggest_categorical('regressor_type', ['XGBRegressor', 'RandomForestRegressor'])
    
    if regressor_type == 'XGBRegressor':
        #tune modelling params
        regressor_params = {
            'max_depth': trial.suggest_int('max_depth', 1, 10),  #shallow trees bc gradient boosting
            'n_estimators': trial.suggest_int('n_estimators', 20, 1000),
            'subsample': trial.suggest_float('subsample', 0.5, 1),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1),
            'gamma': trial.suggest_float('gamma', 0, 0.5),
            'reg_alpha': trial.suggest_categorical('reg_alpha', [0, 0.001, 0.005, 0.01, 0.05]),
            'reg_lambda': trial.suggest_categorical('reg_lambda', [0, 0.001, 0.005, 0.01, 0.05]),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 50),
            'learning_rate': 0.1
        }
        regressor = XGBRegressor(**regressor_params)

    elif regressor_type == 'RandomForestRegressor':
        #tune modelling params
        regressor_params = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 3000),
            'max_depth': trial.suggest_int('max_depth', 5, 50),  #deep trees bc rf
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 15),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 14),
            'max_features': trial.suggest_categorical('max_features', [None, 'sqrt', 'log2'])
        }
        regressor = RandomForestRegressor(**regressor_params)

    #full modelling pipeline
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                ('regressor', regressor)])
    
    #create rmse scorer from custom rmse func
    rmse_scorer = make_scorer(get_rmse)

    #get cv rmse
    all_rmse = cross_val_score(pipeline, X_train, y_train, scoring=rmse_scorer, n_jobs=-1, cv=5)
    mean_rmse = all_rmse.mean()
    return mean_rmse

#tune hyperparams using optuna
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

[I 2023-12-15 16:49:01,698] A new study created in memory with name: no-name-e3499e69-d57f-4cf5-896d-31a7be7b508d
[I 2023-12-15 16:49:07,312] Trial 0 finished with value: 1.0760568833836626 and parameters: {'scaler_type': 'MinMaxScaler', 'encoder_type': 'OneHotEncoder', 'regressor_type': 'RandomForestRegressor', 'n_estimators': 2858, 'max_depth': 30, 'min_samples_split': 4, 'min_samples_leaf': 12, 'max_features': None}. Best is trial 0 with value: 1.0760568833836626.
[I 2023-12-15 16:49:07,863] Trial 1 finished with value: 1.1522878219282713 and parameters: {'scaler_type': 'StandardScaler', 'encoder_type': 'OneHotEncoder', 'regressor_type': 'XGBRegressor', 'max_depth': 7, 'n_estimators': 570, 'subsample': 0.5495344213158437, 'colsample_bytree': 0.579395998891445, 'gamma': 0.19398055805783082, 'reg_alpha': 0.01, 'reg_lambda': 0, 'min_child_weight': 49}. Best is trial 0 with value: 1.0760568833836626.
[I 2023-12-15 16:49:08,790] Trial 2 finished with value: 0.9694698747219596 and paramet

In [13]:
#get best hyperparams found
best_params = study.best_trial.params
best_params

{'scaler_type': 'StandardScaler',
 'encoder_type': 'OneHotEncoder',
 'regressor_type': 'RandomForestRegressor',
 'n_estimators': 1931,
 'max_depth': 26,
 'min_samples_split': 3,
 'min_samples_leaf': 1,
 'max_features': None}

In [14]:
#get best modelling pipeline
best_preprocessor = get_preprocessor(best_params['scaler_type'], best_params['encoder_type'], num_cols, cat_cols)
del best_params['scaler_type']
del best_params['encoder_type']

if best_params['regressor_type'] == 'XGBRegressor':
    del best_params['regressor_type']
    best_params['learning_rate'] = 0.1
    best_regressor = XGBRegressor(**best_params) 
elif best_params['regressor_type'] == 'RandomForestRegressor':
    del best_params['regressor_type']
    best_regressor = RandomForestRegressor(**best_params)

best_pipeline = Pipeline(steps=[('preprocessor', best_preprocessor),
                                ('regressor', best_regressor)])

In [15]:
best_pipeline.fit(X_train, y_train)

In [16]:
#evaluate best model on val set
predictions = best_pipeline.predict(X_val)
val_rmse = get_rmse(y_val, predictions)
print(val_rmse)

0.5048696741536496


In [17]:
#get predictions on test set

In [18]:
predictions = best_pipeline.predict(test_clean)

In [19]:
predictions = pd.DataFrame(predictions)
predictions.to_csv('../test_preds.csv', header=None, index=False)

In [20]:
predictions

Unnamed: 0,0
0,2.305053
1,2.164949
2,2.072999
3,1.987497
4,2.135739
...,...
2495,2.518213
2496,2.428236
2497,1.967197
2498,2.092365
