In [1]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
from preprocessing_utils import (
    identify_column_types,
    standardize_features,
    fit_impute_item_weight,
    transform_impute_item_weight,
    fit_mice_imputation,
    transform_mice_imputation,
    create_features,
    detect_and_remove_outliers,
    transform_features,
    fit_scale_numerical_features,
    transform_scale_numerical_features,
    fit_encode_categorical_features,
    transform_encode_categorical_features
)


print("Loading datasets...")
train_df = pd.read_csv('train_v9rqX0R.csv')
test_df = pd.read_csv('test_AbJTz2l.csv')

print("Processing training set...")
# Step 1: Initial preprocessing
numerical_cols, categorical_cols = identify_column_types(train_df)
train_df = standardize_features(train_df)

print(train_df.shape)
original_columns = set(train_df.columns.to_list())



Loading datasets...
Processing training set...
(8523, 13)


In [2]:

# Step 2: Weight imputation
train_df, item_weights = fit_impute_item_weight(train_df)



# Identify missing columns
missing_columns = original_columns - set(train_df.columns.to_list())
print(train_df.shape)
print("Missing columns:", missing_columns)


(8523, 13)
Missing columns: set()


In [3]:
# Step 3: MICE imputation
selected_columns = [
    "Item_Identifier", "Item_Weight", "Item_Fat_Content", "Item_Visibility",
    "Item_Type", "Item_MRP", "Outlet_Identifier", "Outlet_Establishment_Year",
    "Outlet_Size", "Outlet_Location_Type", "Outlet_Type","Item_Type_Grouped"]
categorical_vars = selected_columns
train_df, mice_mappings = fit_mice_imputation(train_df, selected_columns, categorical_vars)

# Identify missing columns
missing_columns = original_columns - set(train_df.columns.to_list())
print(train_df.shape)
print("Missing columns:", missing_columns)



(8523, 13)
Missing columns: set()


In [4]:
# Step 4: Feature engineering
train_df = create_features(train_df)
train_df = transform_features(train_df)


# Identify missing columns
missing_columns = original_columns - set(train_df.columns.to_list())
print(train_df.shape)
print("Missing columns:", missing_columns)


(8523, 18)
Missing columns: set()


In [5]:
# Step 5: Outlier detection and removal
numerical_cols, categorical_cols= identify_column_types(train_df)
train_df = detect_and_remove_outliers(train_df, test_df, numerical_cols)

# Identify missing columns
missing_columns = original_columns - set(train_df.columns.to_list())
print(train_df.shape)
print("Missing columns:", missing_columns)

(8157, 18)
Missing columns: set()


In [6]:
# Step 6: Scaling
numerical_cols, categorical_cols= identify_column_types(train_df)
train_df, scaler = fit_scale_numerical_features(train_df, numerical_cols)

# Identify missing columns
missing_columns = original_columns - set(train_df.columns.to_list())
print(train_df.shape)
print("Missing columns:", missing_columns)

(8157, 18)
Missing columns: set()


In [7]:
# Step 7: Encoding
train_df, encoder = fit_encode_categorical_features(train_df)

# Identify missing columns
missing_columns = original_columns - set(train_df.columns.to_list())
print(train_df.shape)
print("Missing columns:", missing_columns)

(8157, 18)
Missing columns: set()


In [8]:
train_df.columns.to_list()

['Item_Identifier',
 'Item_Weight',
 'Item_Fat_Content',
 'Item_Visibility',
 'Item_Type',
 'Item_MRP',
 'Outlet_Identifier',
 'Outlet_Establishment_Year',
 'Outlet_Size',
 'Outlet_Location_Type',
 'Outlet_Type',
 'Item_Outlet_Sales',
 'Item_Type_Grouped',
 'Outlet_Age',
 'Establishment_Decade',
 'Outlet_Age_Category',
 'Price_per_Unit_Weight',
 'Item_MRP_Binned']

In [9]:
print("\nProcessing test set...")
# Process test set using fitted transformations but without outlier removal
test_df = standardize_features(test_df)
test_df = transform_impute_item_weight(test_df, item_weights)
numerical_cols, categorical_cols = identify_column_types(test_df)
test_df = transform_mice_imputation(test_df, selected_columns, categorical_vars, mice_mappings)
test_df = create_features(test_df)
test_df = transform_features(test_df)
numerical_cols, categorical_cols = identify_column_types(test_df)
test_df = transform_scale_numerical_features(test_df, numerical_cols, scaler)
test_df = transform_encode_categorical_features(test_df, encoder)

print("\nSaving processed datasets...")
train_df.to_csv('processed_train.csv', index=False)
test_df.to_csv('processed_test.csv', index=False)
print("Done! Processed datasets saved as 'processed_train.csv' and 'processed_test.csv'")


Processing test set...

Saving processed datasets...
Done! Processed datasets saved as 'processed_train.csv' and 'processed_test.csv'


In [28]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import mean_squared_error # compute RMSE by utilizing the mean_squared_error function with the squared parameter set to False
from optuna.samplers import TPESampler
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")


# =============================================================================
# Model Training and Evaluation
# =============================================================================

training_data = pd.read_csv('processed_train.csv')
testing_data = pd.read_csv('processed_test.csv')


print(training_data.columns.tolist())

# Define features and target
X = training_data.drop(columns=['Item_Outlet_Sales','Item_Identifier'])
y = training_data['Item_Outlet_Sales']

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



['Item_Identifier', 'Item_Weight', 'Item_Fat_Content', 'Item_Visibility', 'Item_Type', 'Item_MRP', 'Outlet_Identifier', 'Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type', 'Item_Outlet_Sales', 'Item_Type_Grouped', 'Outlet_Age', 'Establishment_Decade', 'Outlet_Age_Category', 'Price_per_Unit_Weight', 'Item_MRP_Binned']


In [29]:
import sklearn
import xgboost as xgb
import lightgbm as lgb
import catboost

print("Scikit-Learn Version:", sklearn.__version__)
print("XGBoost Version:", xgb.__version__)
print("LightGBM Version:", lgb.__version__)
print("CatBoost Version:", catboost.__version__)


Scikit-Learn Version: 1.5.2
XGBoost Version: 2.1.3
LightGBM Version: 4.5.0
CatBoost Version: 1.2.7


In [31]:
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import RandomizedSearchCV
from tqdm import tqdm
import numpy as np

# Define the models
models = {
    "RandomForest": RandomForestRegressor(),
    "XGBoost": xgb.XGBRegressor(objective="reg:squarederror"),
    "LightGBM": lgb.LGBMRegressor(),
    "CatBoost": CatBoostRegressor(verbose=0),
    "ExtraTrees": ExtraTreesRegressor()
}

# Define hyperparameter grids
param_grids = {
    "RandomForest": {
        "n_estimators": [50, 100, 200, 500],
        "max_depth": [None, 10, 20, 30],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4]
    },
    "XGBoost": {
        "n_estimators": [100, 200, 300],
        "max_depth": [3, 6, 9],
        "learning_rate": [0.01, 0.1, 0.2],
        "subsample": [0.7, 0.8, 1.0]
    },
    "LightGBM": {
        "n_estimators": [100, 200, 300],
        "learning_rate": [0.01, 0.1, 0.2],
        "num_leaves": [20, 30, 40],
        "max_depth": [-1, 10, 20]
    },
    "CatBoost": {
        "iterations": [100, 200, 300],
        "depth": [4, 6, 10],
        "learning_rate": [0.01, 0.05, 0.1],
        "l2_leaf_reg": [3, 5, 7]
    },
    "ExtraTrees": {
        "n_estimators": [50, 100, 200, 500],
        "max_depth": [None, 10, 20, 30],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4]
    }
}

# Store results in DataFrame
results = []

for model_name in tqdm(models, desc="Running RandomizedSearchCV"):
    print(f"\nRunning RandomizedSearchCV for {model_name}...")
    search = RandomizedSearchCV(
        models[model_name], 
        param_distributions=param_grids[model_name], 
        n_iter=20, 
        cv=3, 
        scoring="neg_mean_squared_error", 
        random_state=42, 
        n_jobs=-1
    )
    search.fit(X_train, y_train)
    
    best_model = search.best_estimator_
    best_params = search.best_params_
    preds = best_model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, preds))

    results.append({"Model": model_name, "Best Params": best_params, "RMSE": rmse})

# Convert results to DataFrame and save
df_results_random = pd.DataFrame(results)



df_results_random

Running RandomizedSearchCV:   0%|          | 0/5 [00:00<?, ?it/s]


Running RandomizedSearchCV for RandomForest...


Running RandomizedSearchCV:  20%|██        | 1/5 [00:23<01:33, 23.46s/it]


Running RandomizedSearchCV for XGBoost...


Running RandomizedSearchCV:  40%|████      | 2/5 [00:25<00:32, 10.77s/it]


Running RandomizedSearchCV for LightGBM...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004564 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1047
[LightGBM] [Info] Number of data points in the train set: 4350, number of used features: 16
[LightGBM] [Info] Start training from score 7.398664
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006239 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1050
[LightGBM] [Info] Number of data points in the train set: 4350, number of used features: 16
[LightGBM] [Info] Start training from score 7.404416
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005938 seconds.
You can set `force_row_wise=true` to remov

Running RandomizedSearchCV:  60%|██████    | 3/5 [01:13<00:55, 27.79s/it]


Running RandomizedSearchCV for CatBoost...


Running RandomizedSearchCV:  80%|████████  | 4/5 [01:18<00:18, 18.93s/it]


Running RandomizedSearchCV for ExtraTrees...


Running RandomizedSearchCV: 100%|██████████| 5/5 [01:27<00:00, 17.50s/it]


Unnamed: 0,Model,Best Params,RMSE
0,RandomForest,"{'n_estimators': 100, 'min_samples_split': 5, ...",0.535343
1,XGBoost,"{'subsample': 0.8, 'n_estimators': 100, 'max_d...",0.525609
2,LightGBM,"{'num_leaves': 20, 'n_estimators': 100, 'max_d...",0.531067
3,CatBoost,"{'learning_rate': 0.05, 'l2_leaf_reg': 3, 'ite...",0.523341
4,ExtraTrees,"{'n_estimators': 100, 'min_samples_split': 5, ...",0.530721


In [32]:
df_results_random.to_csv("random_search_results.csv")
display(df_results_random)

Unnamed: 0,Model,Best Params,RMSE
0,RandomForest,"{'n_estimators': 100, 'min_samples_split': 5, ...",0.535343
1,XGBoost,"{'subsample': 0.8, 'n_estimators': 100, 'max_d...",0.525609
2,LightGBM,"{'num_leaves': 20, 'n_estimators': 100, 'max_d...",0.531067
3,CatBoost,"{'learning_rate': 0.05, 'l2_leaf_reg': 3, 'ite...",0.523341
4,ExtraTrees,"{'n_estimators': 100, 'min_samples_split': 5, ...",0.530721


In [26]:
import optuna
from tqdm import tqdm

# Store results
results_optuna = []

n_trials = 50
pbar = tqdm(total=n_trials, desc="Optimizing")

def objective(trial):
    model_name = trial.suggest_categorical("model", ["RandomForest", "XGBoost", "LightGBM", "CatBoost", "ExtraTrees"])
    
    if model_name == "RandomForest":
        model = RandomForestRegressor(
            n_estimators=trial.suggest_int("n_estimators", 50, 500),
            max_depth=trial.suggest_int("max_depth", 1, 30), 
            min_samples_split=trial.suggest_int("min_samples_split", 2, 10),
            min_samples_leaf=trial.suggest_int("min_samples_leaf", 1, 4),
            random_state=42
        )
    
    elif model_name == "XGBoost":
        model = xgb.XGBRegressor(
            n_estimators=trial.suggest_int("n_estimators", 100, 300),
            max_depth=trial.suggest_int("max_depth", 3, 9),  
            learning_rate=trial.suggest_float("learning_rate", 0.01, 0.2),
            subsample=trial.suggest_float("subsample", 0.7, 1.0),
            objective="reg:squarederror",
            random_state=42
        )
    
    elif model_name == "LightGBM":
        model = lgb.LGBMRegressor(
            n_estimators=trial.suggest_int("n_estimators", 100, 300),
            learning_rate=trial.suggest_float("learning_rate", 0.01, 0.2),
            num_leaves=trial.suggest_int("num_leaves", 20, 40),
            max_depth=trial.suggest_int("max_depth", -1, 20),  
            random_state=42
        )
    
    elif model_name == "CatBoost":
        model = CatBoostRegressor(
            iterations=trial.suggest_int("iterations", 100, 300),
            depth=trial.suggest_int("depth", 4, 10),
            learning_rate=trial.suggest_float("learning_rate", 0.01, 0.1),
            l2_leaf_reg=trial.suggest_int("l2_leaf_reg", 3, 7),
            random_state=42,
            verbose=0
        )
    
    elif model_name == "ExtraTrees":
        model = ExtraTreesRegressor(
            n_estimators=trial.suggest_int("n_estimators", 50, 500),
            max_depth=trial.suggest_int("max_depth", 1, 30), 
            min_samples_split=trial.suggest_int("min_samples_split", 2, 10),
            min_samples_leaf=trial.suggest_int("min_samples_leaf", 1, 4),
            random_state=42
        )

    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, preds))

    results_optuna.append({"Model": model_name, "Best Params": trial.params, "RMSE": rmse})
    pbar.update(1)

    return rmse

study = optuna.create_study(direction="minimize", sampler=TPESampler())
study.optimize(objective, n_trials=n_trials)

pbar.close()

# Convert results to DataFrame and save
df_results_optuna = pd.DataFrame(results_optuna)

df_results_optuna


Optimizing:   0%|          | 0/50 [00:00<?, ?it/s][I 2025-02-03 00:33:16,381] A new study created in memory with name: no-name-b972b764-0a85-4bf0-8711-299b12879763
Optimizing:   2%|▏         | 1/50 [00:00<00:10,  4.57it/s][I 2025-02-03 00:33:16,600] Trial 0 finished with value: 2834.890655102435 and parameters: {'model': 'CatBoost', 'iterations': 187, 'depth': 6, 'learning_rate': 0.015199949732119265, 'l2_leaf_reg': 7}. Best is trial 0 with value: 2834.890655102435.
Optimizing:   4%|▍         | 2/50 [00:05<02:24,  3.02s/it][I 2025-02-03 00:33:21,574] Trial 1 finished with value: 2834.858759703403 and parameters: {'model': 'RandomForest', 'n_estimators': 316, 'max_depth': 12, 'min_samples_split': 5, 'min_samples_leaf': 4}. Best is trial 1 with value: 2834.858759703403.
Optimizing:   6%|▌         | 3/50 [00:07<02:15,  2.89s/it][I 2025-02-03 00:33:24,317] Trial 2 finished with value: 2834.858118344438 and parameters: {'model': 'RandomForest', 'n_estimators': 231, 'max_depth': 8, 'min_samp

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000478 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1050
[LightGBM] [Info] Number of data points in the train set: 5709, number of used features: 16
[LightGBM] [Info] Start training from score 7.399722


Optimizing:   8%|▊         | 4/50 [00:08<01:26,  1.89s/it][I 2025-02-03 00:33:24,669] Trial 3 finished with value: 2834.860007048439 and parameters: {'model': 'LightGBM', 'n_estimators': 198, 'learning_rate': 0.08428022960699107, 'num_leaves': 32, 'max_depth': 7}. Best is trial 2 with value: 2834.858118344438.




Optimizing:  10%|█         | 5/50 [00:10<01:25,  1.90s/it][I 2025-02-03 00:33:26,578] Trial 4 finished with value: 2834.858151390057 and parameters: {'model': 'ExtraTrees', 'n_estimators': 260, 'max_depth': 29, 'min_samples_split': 7, 'min_samples_leaf': 4}. Best is trial 2 with value: 2834.858118344438.
Optimizing:  12%|█▏        | 6/50 [00:10<00:59,  1.35s/it][I 2025-02-03 00:33:26,860] Trial 5 finished with value: 2834.858512636126 and parameters: {'model': 'XGBoost', 'n_estimators': 140, 'max_depth': 7, 'learning_rate': 0.13205357573150747, 'subsample': 0.8498164377686344}. Best is trial 2 with value: 2834.858118344438.
[I 2025-02-03 00:33:26,945] Trial 6 finished with value: 2834.8598569709584 and parameters: {'model': 'XGBoost', 'n_estimators': 113, 'max_depth': 4, 'learning_rate': 0.14615667483996347, 'subsample': 0.9942845808695122}. Best is trial 2 with value: 2834.858118344438.
Optimizing:  16%|█▌        | 8/50 [00:10<00:31,  1.33it/s][I 2025-02-03 00:33:27,146] Trial 7 finis

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000333 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1050
[LightGBM] [Info] Number of data points in the train set: 5709, number of used features: 16
[LightGBM] [Info] Start training from score 7.399722
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000391 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1050
[LightGBM] [Info] Number of data points in the train set: 5709, number of used features: 16
[LightGBM] [Info] Start training from score 7.399722


Optimizing:  18%|█▊        | 9/50 [00:11<00:25,  1.60it/s][I 2025-02-03 00:33:27,400] Trial 8 finished with value: 2834.8589143130885 and parameters: {'model': 'LightGBM', 'n_estimators': 177, 'learning_rate': 0.17828810351259547, 'num_leaves': 24, 'max_depth': 7}. Best is trial 2 with value: 2834.858118344438.




Optimizing:  20%|██        | 10/50 [00:14<00:50,  1.27s/it][I 2025-02-03 00:33:30,398] Trial 9 finished with value: 2834.8609788195836 and parameters: {'model': 'RandomForest', 'n_estimators': 321, 'max_depth': 6, 'min_samples_split': 10, 'min_samples_leaf': 3}. Best is trial 2 with value: 2834.858118344438.
Optimizing:  22%|██▏       | 11/50 [00:25<02:41,  4.14s/it][I 2025-02-03 00:33:41,963] Trial 10 finished with value: 2834.8654274793385 and parameters: {'model': 'RandomForest', 'n_estimators': 477, 'max_depth': 22, 'min_samples_split': 2, 'min_samples_leaf': 1}. Best is trial 2 with value: 2834.858118344438.
Optimizing:  24%|██▍       | 12/50 [00:28<02:26,  3.86s/it][I 2025-02-03 00:33:45,118] Trial 11 finished with value: 2834.858337219715 and parameters: {'model': 'ExtraTrees', 'n_estimators': 393, 'max_depth': 29, 'min_samples_split': 7, 'min_samples_leaf': 2}. Best is trial 2 with value: 2834.858118344438.
Optimizing:  26%|██▌       | 13/50 [00:30<02:02,  3.31s/it][I 2025-02-0

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000451 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1050
[LightGBM] [Info] Number of data points in the train set: 5709, number of used features: 16
[LightGBM] [Info] Start training from score 7.399722


Optimizing:  62%|██████▏   | 31/50 [01:16<00:31,  1.66s/it][I 2025-02-03 00:34:32,801] Trial 30 finished with value: 2834.85589402504 and parameters: {'model': 'LightGBM', 'n_estimators': 232, 'learning_rate': 0.1231306621841067, 'num_leaves': 40, 'max_depth': 15}. Best is trial 30 with value: 2834.85589402504.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000658 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1050
[LightGBM] [Info] Number of data points in the train set: 5709, number of used features: 16
[LightGBM] [Info] Start training from score 7.399722


Optimizing:  64%|██████▍   | 32/50 [01:16<00:23,  1.28s/it][I 2025-02-03 00:34:33,199] Trial 31 finished with value: 2834.8545021629634 and parameters: {'model': 'LightGBM', 'n_estimators': 232, 'learning_rate': 0.12486707466524277, 'num_leaves': 40, 'max_depth': 10}. Best is trial 31 with value: 2834.8545021629634.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000480 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1050
[LightGBM] [Info] Number of data points in the train set: 5709, number of used features: 16
[LightGBM] [Info] Start training from score 7.399722


Optimizing:  66%|██████▌   | 33/50 [01:17<00:17,  1.01s/it][I 2025-02-03 00:34:33,580] Trial 32 finished with value: 2834.8566261648516 and parameters: {'model': 'LightGBM', 'n_estimators': 228, 'learning_rate': 0.11997611736244054, 'num_leaves': 40, 'max_depth': 10}. Best is trial 31 with value: 2834.8545021629634.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000460 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1050
[LightGBM] [Info] Number of data points in the train set: 5709, number of used features: 16
[LightGBM] [Info] Start training from score 7.399722


Optimizing:  68%|██████▊   | 34/50 [01:17<00:13,  1.20it/s][I 2025-02-03 00:34:33,992] Trial 33 finished with value: 2834.856918210143 and parameters: {'model': 'LightGBM', 'n_estimators': 218, 'learning_rate': 0.11981759476462739, 'num_leaves': 40, 'max_depth': 10}. Best is trial 31 with value: 2834.8545021629634.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000642 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1050
[LightGBM] [Info] Number of data points in the train set: 5709, number of used features: 16
[LightGBM] [Info] Start training from score 7.399722


Optimizing:  70%|███████   | 35/50 [01:18<00:10,  1.40it/s][I 2025-02-03 00:34:34,425] Trial 34 finished with value: 2834.8577350892647 and parameters: {'model': 'LightGBM', 'n_estimators': 214, 'learning_rate': 0.1194179714039603, 'num_leaves': 40, 'max_depth': 10}. Best is trial 31 with value: 2834.8545021629634.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000544 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1050
[LightGBM] [Info] Number of data points in the train set: 5709, number of used features: 16
[LightGBM] [Info] Start training from score 7.399722


Optimizing:  72%|███████▏  | 36/50 [01:18<00:09,  1.46it/s][I 2025-02-03 00:34:35,048] Trial 35 finished with value: 2834.859159783109 and parameters: {'model': 'LightGBM', 'n_estimators': 246, 'learning_rate': 0.10188970648822973, 'num_leaves': 40, 'max_depth': 14}. Best is trial 31 with value: 2834.8545021629634.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001313 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1050
[LightGBM] [Info] Number of data points in the train set: 5709, number of used features: 16
[LightGBM] [Info] Start training from score 7.399722


Optimizing:  74%|███████▍  | 37/50 [01:19<00:07,  1.66it/s][I 2025-02-03 00:34:35,449] Trial 36 finished with value: 2834.8561445973673 and parameters: {'model': 'LightGBM', 'n_estimators': 198, 'learning_rate': 0.14816887272194845, 'num_leaves': 40, 'max_depth': 10}. Best is trial 31 with value: 2834.8545021629634.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000500 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1050
[LightGBM] [Info] Number of data points in the train set: 5709, number of used features: 16
[LightGBM] [Info] Start training from score 7.399722


Optimizing:  76%|███████▌  | 38/50 [01:19<00:06,  1.88it/s][I 2025-02-03 00:34:35,821] Trial 37 finished with value: 2834.857777396472 and parameters: {'model': 'LightGBM', 'n_estimators': 194, 'learning_rate': 0.15415476898941682, 'num_leaves': 36, 'max_depth': 15}. Best is trial 31 with value: 2834.8545021629634.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001006 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1050
[LightGBM] [Info] Number of data points in the train set: 5709, number of used features: 16
[LightGBM] [Info] Start training from score 7.399722


Optimizing:  78%|███████▊  | 39/50 [01:19<00:05,  1.99it/s][I 2025-02-03 00:34:36,256] Trial 38 finished with value: 2834.8561529983403 and parameters: {'model': 'LightGBM', 'n_estimators': 243, 'learning_rate': 0.1511827875857716, 'num_leaves': 36, 'max_depth': 12}. Best is trial 31 with value: 2834.8545021629634.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000901 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1050
[LightGBM] [Info] Number of data points in the train set: 5709, number of used features: 16
[LightGBM] [Info] Start training from score 7.399722


Optimizing:  80%|████████  | 40/50 [01:20<00:04,  2.14it/s][I 2025-02-03 00:34:36,640] Trial 39 finished with value: 2834.858498083239 and parameters: {'model': 'LightGBM', 'n_estimators': 247, 'learning_rate': 0.15593864084199704, 'num_leaves': 35, 'max_depth': 13}. Best is trial 31 with value: 2834.8545021629634.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000496 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1050
[LightGBM] [Info] Number of data points in the train set: 5709, number of used features: 16
[LightGBM] [Info] Start training from score 7.399722


Optimizing:  82%|████████▏ | 41/50 [01:20<00:03,  2.33it/s][I 2025-02-03 00:34:36,978] Trial 40 finished with value: 2834.858303075865 and parameters: {'model': 'LightGBM', 'n_estimators': 189, 'learning_rate': 0.14260473166049825, 'num_leaves': 36, 'max_depth': 9}. Best is trial 31 with value: 2834.8545021629634.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000292 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1050
[LightGBM] [Info] Number of data points in the train set: 5709, number of used features: 16
[LightGBM] [Info] Start training from score 7.399722


Optimizing:  84%|████████▍ | 42/50 [01:21<00:03,  2.32it/s][I 2025-02-03 00:34:37,412] Trial 41 finished with value: 2834.8598894810793 and parameters: {'model': 'LightGBM', 'n_estimators': 222, 'learning_rate': 0.10845525428974565, 'num_leaves': 40, 'max_depth': 11}. Best is trial 31 with value: 2834.8545021629634.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000742 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1050
[LightGBM] [Info] Number of data points in the train set: 5709, number of used features: 16
[LightGBM] [Info] Start training from score 7.399722


Optimizing:  86%|████████▌ | 43/50 [01:21<00:02,  2.40it/s][I 2025-02-03 00:34:37,799] Trial 42 finished with value: 2834.8599035688085 and parameters: {'model': 'LightGBM', 'n_estimators': 207, 'learning_rate': 0.1307818363926385, 'num_leaves': 37, 'max_depth': 9}. Best is trial 31 with value: 2834.8545021629634.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001187 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1050
[LightGBM] [Info] Number of data points in the train set: 5709, number of used features: 16
[LightGBM] [Info] Start training from score 7.399722


Optimizing:  88%|████████▊ | 44/50 [01:21<00:02,  2.24it/s][I 2025-02-03 00:34:38,316] Trial 43 finished with value: 2834.852518217115 and parameters: {'model': 'LightGBM', 'n_estimators': 242, 'learning_rate': 0.168189208887246, 'num_leaves': 38, 'max_depth': 11}. Best is trial 43 with value: 2834.852518217115.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000410 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1050
[LightGBM] [Info] Number of data points in the train set: 5709, number of used features: 16
[LightGBM] [Info] Start training from score 7.399722


Optimizing:  90%|█████████ | 45/50 [01:22<00:02,  2.24it/s][I 2025-02-03 00:34:38,760] Trial 44 finished with value: 2834.8582079876082 and parameters: {'model': 'LightGBM', 'n_estimators': 241, 'learning_rate': 0.19602261285701206, 'num_leaves': 37, 'max_depth': 14}. Best is trial 43 with value: 2834.852518217115.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000499 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1050
[LightGBM] [Info] Number of data points in the train set: 5709, number of used features: 16
[LightGBM] [Info] Start training from score 7.399722


Optimizing:  92%|█████████▏| 46/50 [01:22<00:01,  2.25it/s][I 2025-02-03 00:34:39,199] Trial 45 finished with value: 2834.856538419316 and parameters: {'model': 'LightGBM', 'n_estimators': 259, 'learning_rate': 0.16754270181840847, 'num_leaves': 38, 'max_depth': 8}. Best is trial 43 with value: 2834.852518217115.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000483 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1050
[LightGBM] [Info] Number of data points in the train set: 5709, number of used features: 16
[LightGBM] [Info] Start training from score 7.399722


Optimizing:  94%|█████████▍| 47/50 [01:23<00:01,  2.39it/s][I 2025-02-03 00:34:39,559] Trial 46 finished with value: 2834.859849185566 and parameters: {'model': 'LightGBM', 'n_estimators': 205, 'learning_rate': 0.1658338291541358, 'num_leaves': 34, 'max_depth': 17}. Best is trial 43 with value: 2834.852518217115.
Optimizing:  96%|█████████▌| 48/50 [01:24<00:01,  1.67it/s][I 2025-02-03 00:34:40,578] Trial 47 finished with value: 2834.859933336365 and parameters: {'model': 'XGBoost', 'n_estimators': 254, 'max_depth': 8, 'learning_rate': 0.13772288278805922, 'subsample': 0.7029063926405527}. Best is trial 43 with value: 2834.852518217115.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000245 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1050
[LightGBM] [Info] Number of data points in the train set: 5709, number of used features: 16
[LightGBM] [Info] Start training from score 7.399722


Optimizing:  98%|█████████▊| 49/50 [01:24<00:00,  1.87it/s][I 2025-02-03 00:34:40,966] Trial 48 finished with value: 2834.857319950315 and parameters: {'model': 'LightGBM', 'n_estimators': 186, 'learning_rate': 0.19110115278291634, 'num_leaves': 38, 'max_depth': 11}. Best is trial 43 with value: 2834.852518217115.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000202 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1050
[LightGBM] [Info] Number of data points in the train set: 5709, number of used features: 16
[LightGBM] [Info] Start training from score 7.399722


Optimizing: 100%|██████████| 50/50 [01:24<00:00,  2.07it/s][I 2025-02-03 00:34:41,326] Trial 49 finished with value: 2834.8575868508733 and parameters: {'model': 'LightGBM', 'n_estimators': 270, 'learning_rate': 0.16124675569225003, 'num_leaves': 28, 'max_depth': 13}. Best is trial 43 with value: 2834.852518217115.
Optimizing: 100%|██████████| 50/50 [01:24<00:00,  1.70s/it]


Unnamed: 0,Model,Best Params,RMSE
0,CatBoost,"{'model': 'CatBoost', 'iterations': 187, 'dept...",2834.890655
1,RandomForest,"{'model': 'RandomForest', 'n_estimators': 316,...",2834.85876
2,RandomForest,"{'model': 'RandomForest', 'n_estimators': 231,...",2834.858118
3,LightGBM,"{'model': 'LightGBM', 'n_estimators': 198, 'le...",2834.860007
4,ExtraTrees,"{'model': 'ExtraTrees', 'n_estimators': 260, '...",2834.858151
5,XGBoost,"{'model': 'XGBoost', 'n_estimators': 140, 'max...",2834.858513
6,XGBoost,"{'model': 'XGBoost', 'n_estimators': 113, 'max...",2834.859857
7,LightGBM,"{'model': 'LightGBM', 'n_estimators': 211, 'le...",2834.859376
8,LightGBM,"{'model': 'LightGBM', 'n_estimators': 177, 'le...",2834.858914
9,RandomForest,"{'model': 'RandomForest', 'n_estimators': 321,...",2834.860979


In [30]:
df_results_optuna.to_csv('optuna_results.csv', index=False)
df_results_optuna

Unnamed: 0,Model,Best Params,RMSE
0,CatBoost,"{'model': 'CatBoost', 'iterations': 187, 'dept...",2834.890655
1,RandomForest,"{'model': 'RandomForest', 'n_estimators': 316,...",2834.85876
2,RandomForest,"{'model': 'RandomForest', 'n_estimators': 231,...",2834.858118
3,LightGBM,"{'model': 'LightGBM', 'n_estimators': 198, 'le...",2834.860007
4,ExtraTrees,"{'model': 'ExtraTrees', 'n_estimators': 260, '...",2834.858151
5,XGBoost,"{'model': 'XGBoost', 'n_estimators': 140, 'max...",2834.858513
6,XGBoost,"{'model': 'XGBoost', 'n_estimators': 113, 'max...",2834.859857
7,LightGBM,"{'model': 'LightGBM', 'n_estimators': 211, 'le...",2834.859376
8,LightGBM,"{'model': 'LightGBM', 'n_estimators': 177, 'le...",2834.858914
9,RandomForest,"{'model': 'RandomForest', 'n_estimators': 321,...",2834.860979


Based on the results, CatBoost appears to be the best model as it consistently achieves the lowest RMSE across different parameter tuning methods. The best-performing parameters from the random grid search and TPESampler are:

Best CatBoost Model Parameters (from TPESampler)

In [46]:
# 70-30
# Define features and target
X = training_data.drop(columns=['Item_Outlet_Sales', 'Item_Identifier'])  # Drop unnecessary columns
y = training_data['Item_Outlet_Sales']

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Best CatBoost parameters from TPESampler
best_params = {
    "iterations": 296,
    "depth": 7,
    "learning_rate": 0.021573131062485866,
    "l2_leaf_reg": 4
}

# Initialize and train the CatBoost model
catboost_model = CatBoostRegressor(**best_params, verbose=0)
catboost_model.fit(X_train, y_train)

# Make predictions
y_pred = catboost_model.predict(X_test)

# Evaluate the model
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Print RMSE
print(f"CatBoost Model RMSE: {rmse:.4f}")

# Reverse log transformation (convert back to original scale)
y_test = np.expm1(y_test)

# Reverse log transformation (convert back to original scale)
y_pred = np.expm1(y_pred)

# Evaluate the model
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Print RMSE
print(f"CatBoost Model RMSE: {rmse:.4f}")

CatBoost Model RMSE: 0.5162
CatBoost Model RMSE: 1141.0726


In [45]:
# 80-20
# Define features and target
X = training_data.drop(columns=['Item_Outlet_Sales', 'Item_Identifier'])  # Drop unnecessary columns
y = training_data['Item_Outlet_Sales']

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Best CatBoost parameters from TPESampler
best_params = {
    "learning_rate": 0.05,
    "l2_leaf_reg": 3,
    "iterations": 200,
    "depth": 4
}

# Initialize and train the CatBoost model
catboost_model = CatBoostRegressor(**best_params, verbose=0)
catboost_model.fit(X_train, y_train)

# Make predictions
y_pred = catboost_model.predict(X_test)

# Evaluate the model
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Print RMSE
print(f"CatBoost Model RMSE: {rmse:.4f}")

# Reverse log transformation (convert back to original scale)
y_test = np.expm1(y_test)

# Reverse log transformation (convert back to original scale)
y_pred = np.expm1(y_pred)

# Evaluate the model
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Print RMSE
print(f"CatBoost Model RMSE: {rmse:.4f}")

CatBoost Model RMSE: 0.5233
CatBoost Model RMSE: 1135.9065
