In [1]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
from preprocessing_utils import (
    identify_column_types,
    standardize_features,
    fit_impute_item_weight,
    transform_impute_item_weight,
    fit_mice_imputation,
    transform_mice_imputation,
    create_features,
    detect_and_remove_outliers,
    transform_features,
    fit_scale_numerical_features,
    transform_scale_numerical_features,
    fit_encode_categorical_features,
    transform_encode_categorical_features
)


print("Loading datasets...")
train_df = pd.read_csv('train_v9rqX0R.csv')
test_df = pd.read_csv('test_AbJTz2l.csv')

print("Processing training set...")
# Step 1: Initial preprocessing
numerical_cols, categorical_cols = identify_column_types(train_df)
train_df = standardize_features(train_df)

print(train_df.shape)
original_columns = set(train_df.columns.to_list())



Loading datasets...
Processing training set...
(8523, 13)


In [2]:

# Step 2: Weight imputation
train_df, item_weights = fit_impute_item_weight(train_df)



# Identify missing columns
missing_columns = original_columns - set(train_df.columns.to_list())
print(train_df.shape)
print("Missing columns:", missing_columns)


(8523, 13)
Missing columns: set()


In [3]:
# Step 3: MICE imputation
selected_columns = [
    "Item_Identifier", "Item_Weight", "Item_Fat_Content", "Item_Visibility",
    "Item_Type", "Item_MRP", "Outlet_Identifier", "Outlet_Establishment_Year",
    "Outlet_Size", "Outlet_Location_Type", "Outlet_Type","Item_Type_Grouped"]
categorical_vars = selected_columns
train_df, mice_mappings = fit_mice_imputation(train_df, selected_columns, categorical_vars)

# Identify missing columns
missing_columns = original_columns - set(train_df.columns.to_list())
print(train_df.shape)
print("Missing columns:", missing_columns)



(8523, 13)
Missing columns: set()


In [4]:
# Step 4: Feature engineering
train_df = create_features(train_df)
train_df = transform_features(train_df)


# Identify missing columns
missing_columns = original_columns - set(train_df.columns.to_list())
print(train_df.shape)
print("Missing columns:", missing_columns)


(8523, 18)
Missing columns: set()


In [5]:
# Step 5: Outlier detection and removal
numerical_cols, categorical_cols= identify_column_types(train_df)
train_df = detect_and_remove_outliers(train_df, test_df, numerical_cols)

# Identify missing columns
missing_columns = original_columns - set(train_df.columns.to_list())
print(train_df.shape)
print("Missing columns:", missing_columns)

(8157, 18)
Missing columns: set()


In [6]:
# Step 6: Scaling
numerical_cols, categorical_cols= identify_column_types(train_df)
train_df, scaler = fit_scale_numerical_features(train_df, numerical_cols)

# Identify missing columns
missing_columns = original_columns - set(train_df.columns.to_list())
print(train_df.shape)
print("Missing columns:", missing_columns)

(8157, 18)
Missing columns: set()


In [7]:
# Step 7: Encoding
train_df, encoder = fit_encode_categorical_features(train_df)

# Identify missing columns
missing_columns = original_columns - set(train_df.columns.to_list())
print(train_df.shape)
print("Missing columns:", missing_columns)

(8157, 18)
Missing columns: set()


In [8]:
train_df.columns.to_list()

['Item_Identifier',
 'Item_Weight',
 'Item_Fat_Content',
 'Item_Visibility',
 'Item_Type',
 'Item_MRP',
 'Outlet_Identifier',
 'Outlet_Establishment_Year',
 'Outlet_Size',
 'Outlet_Location_Type',
 'Outlet_Type',
 'Item_Outlet_Sales',
 'Item_Type_Grouped',
 'Outlet_Age',
 'Establishment_Decade',
 'Outlet_Age_Category',
 'Price_per_Unit_Weight',
 'Item_MRP_Binned']

In [9]:
print("\nProcessing test set...")
# Process test set using fitted transformations but without outlier removal
test_df = standardize_features(test_df)
test_df = transform_impute_item_weight(test_df, item_weights)
numerical_cols, categorical_cols = identify_column_types(test_df)
test_df = transform_mice_imputation(test_df, selected_columns, categorical_vars, mice_mappings)
test_df = create_features(test_df)
test_df = transform_features(test_df)
numerical_cols, categorical_cols = identify_column_types(test_df)
test_df = transform_scale_numerical_features(test_df, numerical_cols, scaler)
test_df = transform_encode_categorical_features(test_df, encoder)

print("\nSaving processed datasets...")
train_df.to_csv('processed_train.csv', index=False)
test_df.to_csv('processed_test.csv', index=False)
print("Done! Processed datasets saved as 'processed_train.csv' and 'processed_test.csv'")


Processing test set...

Saving processed datasets...
Done! Processed datasets saved as 'processed_train.csv' and 'processed_test.csv'


In [3]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import mean_squared_error # compute RMSE by utilizing the mean_squared_error function with the squared parameter set to False
from optuna.samplers import TPESampler
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")


# =============================================================================
# Model Training and Evaluation
# =============================================================================

training_data = pd.read_csv('processed_train.csv')
testing_data = pd.read_csv('processed_test.csv')


print(training_data.columns.tolist())

# Define features and target
X = training_data.drop(columns=['Item_Outlet_Sales','Item_Identifier'])
y = training_data['Item_Outlet_Sales']

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)



['Item_Identifier', 'Item_Weight', 'Item_Fat_Content', 'Item_Visibility', 'Item_Type', 'Item_MRP', 'Outlet_Identifier', 'Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type', 'Item_Outlet_Sales', 'Item_Type_Grouped', 'Outlet_Age', 'Establishment_Decade', 'Outlet_Age_Category', 'Price_per_Unit_Weight', 'Item_MRP_Binned']


In [1]:
import sklearn
import xgboost as xgb
import lightgbm as lgb
import catboost

print("Scikit-Learn Version:", sklearn.__version__)
print("XGBoost Version:", xgb.__version__)
print("LightGBM Version:", lgb.__version__)
print("CatBoost Version:", catboost.__version__)


Scikit-Learn Version: 1.5.2
XGBoost Version: 2.1.3
LightGBM Version: 4.5.0
CatBoost Version: 1.2.7


In [4]:
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import RandomizedSearchCV
from tqdm import tqdm
import numpy as np

# Define the models
models = {
    "RandomForest": RandomForestRegressor(),
    "XGBoost": xgb.XGBRegressor(objective="reg:squarederror"),
    "LightGBM": lgb.LGBMRegressor(),
    "CatBoost": CatBoostRegressor(verbose=0),
    "ExtraTrees": ExtraTreesRegressor()
}

# Define hyperparameter grids
param_grids = {
    "RandomForest": {
        "n_estimators": [50, 100, 200, 500],
        "max_depth": [None, 10, 20, 30],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4]
    },
    "XGBoost": {
        "n_estimators": [100, 200, 300],
        "max_depth": [3, 6, 9],
        "learning_rate": [0.01, 0.1, 0.2],
        "subsample": [0.7, 0.8, 1.0]
    },
    "LightGBM": {
        "n_estimators": [100, 200, 300],
        "learning_rate": [0.01, 0.1, 0.2],
        "num_leaves": [20, 30, 40],
        "max_depth": [-1, 10, 20]
    },
    "CatBoost": {
        "iterations": [100, 200, 300],
        "depth": [4, 6, 10],
        "learning_rate": [0.01, 0.05, 0.1],
        "l2_leaf_reg": [3, 5, 7]
    },
    "ExtraTrees": {
        "n_estimators": [50, 100, 200, 500],
        "max_depth": [None, 10, 20, 30],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4]
    }
}

# Store results in DataFrame
results = []

for model_name in tqdm(models, desc="Running RandomizedSearchCV"):
    print(f"\nRunning RandomizedSearchCV for {model_name}...")
    search = RandomizedSearchCV(
        models[model_name], 
        param_distributions=param_grids[model_name], 
        n_iter=20, 
        cv=3, 
        scoring="neg_mean_squared_error", 
        random_state=42, 
        n_jobs=-1
    )
    search.fit(X_train, y_train)
    
    best_model = search.best_estimator_
    best_params = search.best_params_
    preds = best_model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, preds))

    results.append({"Model": model_name, "Best Params": best_params, "RMSE": rmse})

# Convert results to DataFrame and save
df_results_random = pd.DataFrame(results)


df_results_random


Running RandomizedSearchCV:   0%|          | 0/5 [00:00<?, ?it/s]


Running RandomizedSearchCV for RandomForest...


Running RandomizedSearchCV:  20%|██        | 1/5 [00:21<01:24, 21.12s/it]


Running RandomizedSearchCV for XGBoost...


Running RandomizedSearchCV:  40%|████      | 2/5 [00:22<00:29,  9.68s/it]


Running RandomizedSearchCV for LightGBM...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004480 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1049
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004333 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004307 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1048
[LightGBM] [Info] Total Bins 1048
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004401 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory 

Running RandomizedSearchCV:  60%|██████    | 3/5 [01:13<00:57, 28.51s/it]


Running RandomizedSearchCV for CatBoost...


Running RandomizedSearchCV:  80%|████████  | 4/5 [01:18<00:19, 19.31s/it]


Running RandomizedSearchCV for ExtraTrees...


Running RandomizedSearchCV: 100%|██████████| 5/5 [01:26<00:00, 17.29s/it]


RandomizedSearchCV Results Saved: 'random_search_results.csv'





Unnamed: 0,Model,Best Params,RMSE
0,RandomForest,"{'n_estimators': 100, 'min_samples_split': 10,...",0.526283
1,XGBoost,"{'subsample': 0.8, 'n_estimators': 100, 'max_d...",0.520316
2,LightGBM,"{'num_leaves': 20, 'n_estimators': 200, 'max_d...",0.528208
3,CatBoost,"{'learning_rate': 0.05, 'l2_leaf_reg': 3, 'ite...",0.516475
4,ExtraTrees,"{'n_estimators': 50, 'min_samples_split': 5, '...",0.522938


In [5]:
display(df_results_random)

Unnamed: 0,Model,Best Params,RMSE
0,RandomForest,"{'n_estimators': 100, 'min_samples_split': 10,...",0.526283
1,XGBoost,"{'subsample': 0.8, 'n_estimators': 100, 'max_d...",0.520316
2,LightGBM,"{'num_leaves': 20, 'n_estimators': 200, 'max_d...",0.528208
3,CatBoost,"{'learning_rate': 0.05, 'l2_leaf_reg': 3, 'ite...",0.516475
4,ExtraTrees,"{'n_estimators': 50, 'min_samples_split': 5, '...",0.522938


In [None]:
import optuna
from tqdm import tqdm

# Store results
results_optuna = []

n_trials = 50
pbar = tqdm(total=n_trials, desc="Optimizing")

def objective(trial):
    model_name = trial.suggest_categorical("model", ["RandomForest", "XGBoost", "LightGBM", "CatBoost", "ExtraTrees"])
    
    if model_name == "RandomForest":
        model = RandomForestRegressor(
            n_estimators=trial.suggest_int("n_estimators", 50, 500),
            max_depth=trial.suggest_int("max_depth", 1, 30), 
            min_samples_split=trial.suggest_int("min_samples_split", 2, 10),
            min_samples_leaf=trial.suggest_int("min_samples_leaf", 1, 4),
            random_state=42
        )
    
    elif model_name == "XGBoost":
        model = xgb.XGBRegressor(
            n_estimators=trial.suggest_int("n_estimators", 100, 300),
            max_depth=trial.suggest_int("max_depth", 3, 9),  
            learning_rate=trial.suggest_float("learning_rate", 0.01, 0.2),
            subsample=trial.suggest_float("subsample", 0.7, 1.0),
            objective="reg:squarederror",
            random_state=42
        )
    
    elif model_name == "LightGBM":
        model = lgb.LGBMRegressor(
            n_estimators=trial.suggest_int("n_estimators", 100, 300),
            learning_rate=trial.suggest_float("learning_rate", 0.01, 0.2),
            num_leaves=trial.suggest_int("num_leaves", 20, 40),
            max_depth=trial.suggest_int("max_depth", -1, 20),  
            random_state=42
        )
    
    elif model_name == "CatBoost":
        model = CatBoostRegressor(
            iterations=trial.suggest_int("iterations", 100, 300),
            depth=trial.suggest_int("depth", 4, 10),
            learning_rate=trial.suggest_float("learning_rate", 0.01, 0.1),
            l2_leaf_reg=trial.suggest_int("l2_leaf_reg", 3, 7),
            random_state=42,
            verbose=0
        )
    
    elif model_name == "ExtraTrees":
        model = ExtraTreesRegressor(
            n_estimators=trial.suggest_int("n_estimators", 50, 500),
            max_depth=trial.suggest_int("max_depth", 1, 30), 
            min_samples_split=trial.suggest_int("min_samples_split", 2, 10),
            min_samples_leaf=trial.suggest_int("min_samples_leaf", 1, 4),
            random_state=42
        )

    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, preds))

    results_optuna.append({"Model": model_name, "Best Params": trial.params, "RMSE": rmse})
    pbar.update(1)

    return rmse

study = optuna.create_study(direction="minimize", sampler=TPESampler())
study.optimize(objective, n_trials=n_trials)

pbar.close()

# Convert results to DataFrame and save
df_results_optuna = pd.DataFrame(results_optuna)

df_results_optuna


In [10]:
df_results_optuna

Unnamed: 0,Model,Best Params,RMSE
0,CatBoost,"{'model': 'CatBoost', 'iterations': 215, 'dept...",0.520592
1,RandomForest,"{'model': 'RandomForest', 'n_estimators': 329,...",0.530089
2,LightGBM,"{'model': 'LightGBM', 'n_estimators': 141, 'le...",0.540498
3,LightGBM,"{'model': 'LightGBM', 'n_estimators': 270, 'le...",0.536452
4,RandomForest,"{'model': 'RandomForest', 'n_estimators': 290,...",0.53355
5,LightGBM,"{'model': 'LightGBM', 'n_estimators': 208, 'le...",0.563068
6,RandomForest,"{'model': 'RandomForest', 'n_estimators': 450,...",0.535583
7,XGBoost,"{'model': 'XGBoost', 'n_estimators': 250, 'max...",0.544326
8,LightGBM,"{'model': 'LightGBM', 'n_estimators': 173, 'le...",0.536043
9,CatBoost,"{'model': 'CatBoost', 'iterations': 164, 'dept...",0.518198


Based on the results, CatBoost appears to be the best model as it consistently achieves the lowest RMSE across different parameter tuning methods. The best-performing parameters from the random grid search and TPESampler are:

Best CatBoost Model Parameters (from TPESampler)

In [21]:
# Define features and target
X = training_data.drop(columns=['Item_Outlet_Sales', 'Item_Identifier'])  # Drop unnecessary columns
y = training_data['Item_Outlet_Sales']

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Best CatBoost parameters from TPESampler
best_params = {
    "iterations": 296,
    "depth": 7,
    "learning_rate": 0.021573131062485866,
    "l2_leaf_reg": 4
}

# Initialize and train the CatBoost model
catboost_model = CatBoostRegressor(**best_params, verbose=0)
catboost_model.fit(X_train, y_train)

# Make predictions
y_pred = catboost_model.predict(X_test)

# Evaluate the model
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Print RMSE
print(f"CatBoost Model RMSE: {rmse:.4f}")

CatBoost Model RMSE: 0.5162


In [22]:
# Reverse log transformation (convert back to original scale)
y_test = np.expm1(y_test)

# Reverse log transformation (convert back to original scale)
y_pred = np.expm1(y_pred)

# Evaluate the model
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Print RMSE
print(f"CatBoost Model RMSE: {rmse:.4f}")

CatBoost Model RMSE: 1141.0726
