In [1]:
!pip install catboost

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: C:\Users\soumy\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [2]:
!pip install -q lightgbm


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: C:\Users\soumy\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [3]:
!pip install -q xgboost


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: C:\Users\soumy\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [4]:
import pandas as pd
import numpy as np

from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

from catboost import CatBoostRegressor


In [5]:
DATA_PATH = "feature_engineered_master.csv"
df = pd.read_csv(DATA_PATH)

print(df.shape)
df.head()


(3854, 18)


Unnamed: 0,mall_id,block_id,block_type,store_code,store_name,retailer_code,bl1_label,bl2_label,bl3_label,gla,gla_category,cur_code,sales_r12m,total_costs_r12m,sales_per_cost,has_financials,sri_score,has_sri
0,33,6990,CELL,1024429,,478,Food Stores & Mass Merchandise,Hypermarkets,Hypermarkets,5125.06,LARGE UNITS,PLN,3925410.561,275000.7278,14.274182,1.0,,
1,16,1669,CELL,1313684,,137432,Services,Other product-related services,Others Products related services,116.0,SMALL UNITS,EUR,33232.848,,,1.0,,
2,26,6436,CELL,1001163,,478,Food Stores & Mass Merchandise,Hypermarkets,Hypermarkets,15374.0,LARGE UNITS,,,,,,,
3,10,4653,CELL,1312660,,108125,Culture & Media & Technology,"Books, Music & Multimedia",Bookstore & stationery,85.0,SMALL UNITS,CZK,87862.674,20984.35704,4.187056,1.0,,
4,16,1642,KIOSK,1314473,,55270,Food Stores & Mass Merchandise,Frozen food,Frozen food,,,,,,,,,


Defines the target (`sales_r12m`), drops rows missing it, and creates `log_sales = log1p(sales)` (log scale stabilizes skew); `y` is what models learn.

In [6]:
TARGET = "sales_r12m"

# Drop rows with missing target
df = df.dropna(subset=[TARGET])

# Log-transform target (recommended)
df["log_sales"] = np.log1p(df[TARGET])
y = df["log_sales"]

print("Rows after cleaning:", df.shape[0])


Rows after cleaning: 3609


Builds the feature matrix `X` by dropping identifiers/leak-prone columns, and sets `groups = mall_id` so validation splits don’t mix the same mall across train/val.


In [7]:
DROP_COLS = [
    TARGET,
    "log_sales",
    "store_code",
    "block_id",
    "store_name"
]

X = df.drop(columns=[c for c in DROP_COLS if c in df.columns])
groups = df["mall_id"]  # group-aware split


Splits features into categorical vs numeric columns (based on dtype) so we can preprocess them differently; prints lists to sanity-check feature typing.

In [8]:
categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()
numeric_cols = X.select_dtypes(exclude=["object"]).columns.tolist()

print("Categorical features:", categorical_cols)
print("Numeric features:", numeric_cols)


Categorical features: ['block_type', 'bl1_label', 'bl2_label', 'bl3_label', 'gla_category', 'cur_code']
Numeric features: ['mall_id', 'retailer_code', 'gla', 'total_costs_r12m', 'sales_per_cost', 'has_financials', 'sri_score', 'has_sri']


Creates a 5-fold GroupKFold split and takes the first fold: train/val are separated by `mall_id` (interpretation: val simulates “new malls”).


In [9]:
gkf = GroupKFold(n_splits=5)

train_idx, val_idx = next(gkf.split(X, y, groups))

X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]


Trains an Elastic Net baseline with full preprocessing (median impute + scale numeric, most-frequent impute + one-hot categorical) and outputs (RMSE, R²) on log-sales: lower RMSE and higher R² are better.


In [10]:
# Identify columns again (safe)
categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()
numeric_cols = X.select_dtypes(exclude=["object"]).columns.tolist()

numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols),
    ],
    remainder="drop"
)

elastic_net = Pipeline(steps=[
    ("prep", preprocessor),
    ("model", ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=42))
])

elastic_net.fit(X_train, y_train)
pred_en = elastic_net.predict(X_val)

rmse_en = np.sqrt(mean_squared_error(y_val, pred_en))
r2_en = r2_score(y_val, pred_en)

rmse_en, r2_en


(np.float64(0.9599346088201473), 0.43742864901528744)

Trains a Random Forest on *numeric-only* features (median-imputed), then evaluates on validation; RMSE/R² interpret the same way as above (but note: this ignores categorical variables entirely).


In [11]:
from sklearn.impute import SimpleImputer

num_imputer = SimpleImputer(strategy="median")

X_train_num = num_imputer.fit_transform(X_train[numeric_cols])
X_val_num = num_imputer.transform(X_val[numeric_cols])

rf = RandomForestRegressor(
    n_estimators=300,
    max_depth=12,
    min_samples_leaf=5,
    random_state=42,
    n_jobs=-1
)

rf.fit(X_train_num, y_train)
pred_rf = rf.predict(X_val_num)

rmse_rf = np.sqrt(mean_squared_error(y_val, pred_rf))
r2_rf = r2_score(y_val, pred_rf)
rmse_rf, r2_rf


(np.float64(0.6008661580551038), 0.7795806074132077)

Cleans categorical columns for CatBoost by converting them to strings and replacing missing values with "missing" (interpretation: ensures CatBoost won’t crash on NaNs in categoricals).

In [12]:
# Make sure we're not working on views
X_train = X_train.copy()
X_val = X_val.copy()

# Recompute categorical cols robustly: treat low-cardinality ints as categoricals if needed
categorical_cols = X_train.select_dtypes(include=["object", "category"]).columns.tolist()

# Force ALL categorical cols to string and fill NaNs
for col in categorical_cols:
    X_train[col] = X_train[col].astype(str).fillna("missing")
    X_val[col] = X_val[col].astype(str).fillna("missing")

# EXTRA SAFETY: replace any remaining NaN anywhere in categorical cols (covers weird cases)
X_train[categorical_cols] = X_train[categorical_cols].replace({np.nan: "missing"})
X_val[categorical_cols] = X_val[categorical_cols].replace({np.nan: "missing"})

# Sanity check: if this prints >0, something is still wrong
print("NaNs in categorical (train):", X_train[categorical_cols].isna().sum().sum())
print("NaNs in categorical (val):", X_val[categorical_cols].isna().sum().sum())

NaNs in categorical (train): 0
NaNs in categorical (val): 0


Fits CatBoost on the mixed feature set (numeric + categorical indices) and reports RMSE/R² on log-sales; if CatBoost RMSE is lowest, it’s your best model so far.


In [13]:
cat_features_idx = [X_train.columns.get_loc(col) for col in categorical_cols]

cat = CatBoostRegressor(
    iterations=800,
    depth=8,
    learning_rate=0.05,
    loss_function="RMSE",
    random_seed=42,
    verbose=False
)

cat.fit(X_train, y_train, cat_features=cat_features_idx)

pred_cat = cat.predict(X_val)

rmse_cat = np.sqrt(mean_squared_error(y_val, pred_cat))
r2_cat = r2_score(y_val, pred_cat)

rmse_cat, r2_cat


(np.float64(0.560312053557527), 0.8083299558511131)

### LightGBM (Gradient Boosting Trees)

Trains a LightGBM regressor on the same train/validation split. We let LightGBM handle categorical features by converting them to pandas `category` dtype.


In [14]:
import lightgbm as lgb

# Copy to avoid side effects
X_train_lgb = X_train.copy()
X_val_lgb = X_val.copy()

# Make categorical cols category dtype for LightGBM
for col in categorical_cols:
    X_train_lgb[col] = X_train_lgb[col].astype('category')
    X_val_lgb[col] = X_val_lgb[col].astype('category')

lgbm = lgb.LGBMRegressor(
    n_estimators=5000,
    learning_rate=0.03,
    num_leaves=63,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

lgbm.fit(
    X_train_lgb, y_train,
    eval_set=[(X_val_lgb, y_val)],
    eval_metric='rmse',
    callbacks=[lgb.early_stopping(stopping_rounds=200, verbose=False)]
)

pred_lgb = lgbm.predict(X_val_lgb, num_iteration=lgbm.best_iteration_)
rmse_lgb = np.sqrt(mean_squared_error(y_val, pred_lgb))
r2_lgb = r2_score(y_val, pred_lgb)

rmse_lgb, r2_lgb

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000657 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1458
[LightGBM] [Info] Number of data points in the train set: 2881, number of used features: 13
[LightGBM] [Info] Start training from score 11.873930


(np.float64(0.530858104625662), 0.8279513707968207)

### HistGradientBoostingRegressor (Sklearn)

A strong, fast baseline that works well on tabular data. Uses one-hot encoding for categoricals via a preprocessing pipeline.


In [15]:
# from sklearn.ensemble import HistGradientBoostingRegressor

# # Reuse the same preprocessor pattern (impute + one-hot)
# hgb = Pipeline(steps=[
#     ('prep', preprocessor),
#     ('model', HistGradientBoostingRegressor(
#         learning_rate=0.05,
#         max_depth=8,
#         max_iter=500,
#         random_state=42
#     ))
# ])

# hgb.fit(X_train, y_train)
# pred_hgb = hgb.predict(X_val)

# rmse_hgb = np.sqrt(mean_squared_error(y_val, pred_hgb))
# r2_hgb = r2_score(y_val, pred_hgb)

# rmse_hgb, r2_hgb

Creates a comparison table across Elastic Net / Random Forest / CatBoost and sorts by RMSE (interpretation: top row = best on log-scale RMSE).


In [16]:
import xgboost as xgb

# XGBoost with the same preprocessing (impute + one-hot) as other baselines
xgb_model = Pipeline(steps=[
    ('prep', preprocessor),
    ('model', xgb.XGBRegressor(
        objective='reg:squarederror',
        n_estimators=100,
        learning_rate=0.01,
        max_depth=5,
        subsample=0.7,
        colsample_bytree=0.7,
        reg_alpha=0.0,
        reg_lambda=1.0,
        random_state=42,
        n_jobs=-1,
        tree_method='hist'
    ))
])

# NOTE: sklearn Pipeline doesn't expose early stopping cleanly without splitting preprocessed arrays.
# For consistency with the notebook structure, we run without early stopping here.
# (If you want early stopping, I can give a small helper to pre-transform and pass eval_set.)

xgb_model.fit(X_train, y_train)
pred_xgb = xgb_model.predict(X_val)

rmse_xgb = np.sqrt(mean_squared_error(y_val, pred_xgb))
r2_xgb = r2_score(y_val, pred_xgb)

rmse_xgb, r2_xgb

(np.float64(0.8179268185826872), 0.5915647573125506)

In [17]:
results = pd.DataFrame({
    'Model': ['Elastic Net', 'Random Forest', 'CatBoost', 'LightGBM', 'XGBoost'],
    'RMSE (log sales)': [rmse_en, rmse_rf, rmse_cat, rmse_lgb, rmse_xgb],
    'R2': [r2_en, r2_rf, r2_cat, r2_lgb, r2_xgb]
})

results.sort_values('RMSE (log sales)')


Unnamed: 0,Model,RMSE (log sales),R2
3,LightGBM,0.530858,0.827951
2,CatBoost,0.560312,0.80833
1,Random Forest,0.600866,0.779581
4,XGBoost,0.817927,0.591565
0,Elastic Net,0.959935,0.437429


Converts CatBoost predictions back to euros using `expm1` and computes RMSE in original sales units (interpretation: this is the “business-readable” error magnitude).


In [18]:
pred_cat_eur = np.expm1(pred_cat)
y_val_eur = np.expm1(y_val)

rmse_eur = np.sqrt(mean_squared_error(y_val_eur, pred_cat_eur))
rmse_eur


np.float64(541434.6385416274)

Shows summary stats of actual validation sales in euros (interpretation: helps judge whether the euro-RMSE is large or small relative to typical sales levels).


In [19]:
y_val_eur.describe()


count    7.280000e+02
mean     3.304868e+05
std      8.961972e+05
min      1.139544e+03
25%      6.094097e+04
50%      1.092759e+05
75%      2.745195e+05
max      1.509674e+07
Name: log_sales, dtype: float64

Computes MAE in euros (interpretation: average absolute error; often easier to explain than RMSE and less sensitive to huge outliers).


In [20]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_val_eur, pred_cat_eur)

np.float64(100750.36491764447)

Computes mean absolute error by `gla_category` (interpretation: tells you which store-size segment the model struggles with most—higher = worse).


In [21]:
# Assume you have gla or gla_category
df_val = X_val.copy()
df_val["actual"] = y_val_eur
df_val["pred"] = pred_cat_eur

df_val["abs_error"] = np.abs(df_val["actual"] - df_val["pred"])

df_val.groupby("gla_category")["abs_error"].mean()

gla_category
LARGE UNITS    1.066235e+06
MSU            2.770588e+05
SMALL UNITS    3.337651e+04
nan            1.785308e+04
Name: abs_error, dtype: float64

Aggregates predictions vs actuals at the mall level (sum of store sales per mall) and summarizes absolute error distribution (interpretation: shows where the model is most wrong by mall).


In [22]:
df_val["mall_id"] = groups.iloc[val_idx].values

mall_error = (
    df_val
    .groupby("mall_id")[["actual", "pred"]]
    .sum()
)

mall_error["abs_error"] = np.abs(
    mall_error["actual"] - mall_error["pred"]
)

mall_error["abs_error"].describe()

count    4.000000e+00
mean     1.105177e+07
std      1.210606e+07
min      9.578605e+05
25%      4.848634e+06
50%      7.333593e+06
75%      1.353673e+07
max      2.858204e+07
Name: abs_error, dtype: float64