In [37]:
import os

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


csv_path = os.path.join('..', 'data', 'train_v9rqX0R.csv')
data = pd.read_csv(csv_path)
# load the dataset

In [38]:
test_csv_path = os.path.join('..', 'data', 'test_AbJTz2l.csv')
test_data = pd.read_csv(test_csv_path)

In [64]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge
import xgboost as xgb
from sklearn.preprocessing import QuantileTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
random_forest_model = RandomForestRegressor(
        n_estimators=400,
        max_depth=10,
        min_samples_split=5,
        min_samples_leaf=2,
        max_features='sqrt',
    )
linear_model = Ridge(alpha=0.32, solver='cholesky')
boost_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=461, learning_rate=0.02, max_depth=3)

quant_transformer = QuantileTransformer(output_distribution='normal')
visib_quant_transformer = QuantileTransformer(output_distribution='normal')

idxs = np.random.shuffle(np.linspace(0,len(data),len(data)))

In [4]:
MIN_MAX = {
    "Item_Weight" : {"min":0,"max":25},
    "Item_Visibility" : {"min":0,"max":0.2},
    "Item_MRP": {"min":30,"max":270},
    "Item_Outlet_Sales": {"min":0,"max":14000},
    "Outlet_Establishment_Year": {"min":1985,"max":2009}, 
}
def normalize(data, column, clip = False):
    if clip:
        data[column][data[column] > MIN_MAX[column]['max']] = MIN_MAX[column]['max']
        data[column][data[column] < MIN_MAX[column]['min']] = MIN_MAX[column]['min']
    normalized_col = (data[column] - MIN_MAX[column]['min']) / (MIN_MAX[column]['max'] - MIN_MAX[column]['min'])
    return normalized_col.values.reshape(-1,1)
# create normalize function for numerical values


In [5]:
from sklearn.preprocessing import OneHotEncoder

def extract_tier_level(data):
    data["Outlet_Location_Type"] = data["Outlet_Location_Type"].str[-1]
    return data["Outlet_Location_Type"].values.reshape(-1,1).astype(np.int64)

def onehotencode(data,column):
    encoder = OneHotEncoder(sparse_output=False)
    return encoder.fit_transform(data[column].values.reshape(-1,1))

In [6]:
def extract_supermarket_level(data):
    data["Outlet_Type"] = data["Outlet_Location_Type"].str[-1]
    data["Outlet_Type"][data["Outlet_Type"] == "e"] = 0
    return data["Outlet_Type"].values.astype(np.int64).reshape(-1,1)

In [51]:
FAT_MAP = {
           'Low Fat':0,
           'Regular':1,
           'low fat':0,
           'LF'     :0,
           'reg'   :1
          }
def map_fat(data):
    """#3 Handles preprocessing of column Item_Fat_Content

    Args:
        data (pd.DataFrame): dataset
    """    
    for key in FAT_MAP.keys():
        data["Item_Fat_Content"][data["Item_Fat_Content"] == key] = FAT_MAP[key]

In [40]:
def handle_weight_nan(data, drop=True):
    data['Item_Weight'] = data.groupby('Item_Identifier')['Item_Weight'].transform(
        lambda x: x.fillna(method='ffill').fillna(method='bfill'))
# only 4 are left. Drop them.
    if drop:
        data.dropna(subset=['Item_Weight'],inplace=True)


In [9]:
def handle_outlet_size_nan(data):
    data["Outlet_Size"][data["Outlet_Identifier"]=='OUT045'] = 'Small'
    data["Outlet_Size"][data["Outlet_Identifier"]=='OUT017'] = 'Small'
    data["Outlet_Size"][data["Outlet_Identifier"]=='OUT010'] = 'Small'

In [10]:
def set_outlet_size(data):
    data["Outlet_Size"][data["Outlet_Size"]=='Small'] = 1
    data["Outlet_Size"][data["Outlet_Size"]=='Medium'] = 2
    data["Outlet_Size"][data["Outlet_Size"]=='High'] = 3

In [11]:
def split_identifier(data):
    data["Item_Identifier_1"] = data["Item_Identifier"].str[0:2]
    data["Item_Identifier_2"] = data["Item_Identifier"].str[2:3]
    data["Item_Identifier_3"] = data["Item_Identifier"].str[3:]

In [124]:
processed_data = None
handle_weight_nan(data)
handle_outlet_size_nan(data)
split_identifier(data)
for column in ["Item_Visibility","Item_MRP"]:
    normalized_col = normalize(data, column=column, clip=True)
    if processed_data is None:
        processed_data = normalized_col
    else:
        processed_data = np.concat((processed_data, normalized_col),axis=1)
codes = extract_tier_level(data)
#codes = onehotencode(data,column="Outlet_Location_Type")
processed_data = np.concat((processed_data, codes),axis=1)
outlet_type = extract_supermarket_level(data)
#outlet_type = onehotencode(data,column="Outlet_Type")
processed_data = np.concat((processed_data, outlet_type),axis=1)
year = data["Outlet_Establishment_Year"].values.reshape(-1,1)
processed_data = np.concat((processed_data, year),axis=1)
map_fat(data)
fat = data["Item_Fat_Content"].values.reshape(-1,1)
#fat = onehotencode(data,column="Item_Fat_Content")
processed_data = np.concat((processed_data, fat),axis=1)
weight = data["Item_Weight"].values.reshape(-1,1)
processed_data = np.concat((processed_data, weight),axis=1)
#size = onehotencode(data,column="Outlet_Size")
set_outlet_size(data)
size = data["Outlet_Size"].values.reshape(-1,1)
processed_data = np.concat((processed_data, size),axis=1)
otlt_id = onehotencode(data,column="Outlet_Identifier")
processed_data = np.concat((processed_data, otlt_id),axis=1)
item_type = onehotencode(data,column="Item_Type")
processed_data = np.concat((processed_data, item_type),axis=1)
item_id1 = onehotencode(data,column="Item_Identifier_1")
processed_data = np.concat((processed_data, item_id1),axis=1)
item_id2 = onehotencode(data,column="Item_Identifier_2")
processed_data = np.concat((processed_data, item_id2),axis=1)
item_id3 = onehotencode(data,column="Item_Identifier_3")
processed_data = np.concat((processed_data, item_id3),axis=1)
X = processed_data
y = quant_transformer.fit_transform(data["Item_Outlet_Sales"].values.reshape(-1, 1))
#y = normalize(data, "Item_Outlet_Sales")
X[:] = X[idxs]
y[:] = y[idxs]
print(X.shape, y.shape)

  lambda x: x.fillna(method='ffill').fillna(method='bfill'))
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  data["Outlet_Size"][data["Outlet_Identifier"]=='OUT045'] = 'Small'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

(8519, 123) (8519, 1)


In [125]:
X_train, X_val, y_train, y_val = train_test_split(X,y,test_size=0.2)
random_forest_model.fit(X_train,y_train)
y_val_predict = random_forest_model.predict(X_val)
val_metric = mean_squared_error(y_val, y_val_predict)
print(val_metric)
linear_model.fit(X_train,y_train)
y_val_predict = linear_model.predict(X_val)
val_metric = mean_squared_error(y_val, y_val_predict)
print(val_metric)

#0.01573721055840206  Item_Visibility
#0.013370745446789874 + Item_MRP
#0.013537161791957568 + Outlet_Location_Type 0.00952854902358565
#0.012630846804313844 + ohe Outlet_Location_Type 0.00973791777256838
#0.012783276198845138 + Outlet_Type 0.009433934667527642
#0.012848040532094557 + ohe Outlet_Type 0.009499097712716209
#0.007018652199454604 + Outlet_Establishment_Year 0.0.009411041734326737
#0.006749213477986866 + Item_Fat_Content 0.009353645105473936
#0.0074958593861559585 + ohe Item_Fat_Content 0.010607170411496225
#0.006109566201511222 + Item_Weight 0.009597284410110412
#0.006625616471489953 + ohe Outlet_Size 0.00882434285604162
#0.006669525973025804 + Outlet_Size 0.009529525099822588
#0.005896835430508409 + Outlet_Identifier 0.006027013829358424
#0.005705670970079423 +ohe Item_type 0.005889919400917727
#0.006441145499968069 +Item_Identifier 0.006264654587190418


  return fit_method(estimator, *args, **kwargs)


0.3255978039626589
0.30262177885000685


In [None]:
# based on above experimentation, it looks like not all the features are required.
# Lets try with feature that gave better delta
#0.013370745446789874 + Item_MRP
#0.007018652199454604 + Outlet_Establishment_Year 0.0.009411041734326737
#0.006109566201511222 + Item_Weight 0.009597284410110412
#0.005896835430508409 + Outlet_Identifier 0.006027013829358424
data['source'] = 'train'
test_data['source'] = 'test'
data = pd.concat([data, test_data], ignore_index=True)
handle_weight_nan(data,drop=False)
map_fat(data)
test_data = data[data['source'] == 'test'].drop('source', axis=1)
data = data[data['source'] == 'train'].drop('source', axis=1)



mrp = data["Item_MRP"].values.reshape(-1,1)
processed_data = mrp
year = data["Outlet_Establishment_Year"].values.reshape(-1,1)
processed_data = np.concat((processed_data, year),axis=1)
weight = data["Item_Weight"].values.reshape(-1,1)
processed_data = np.concat((processed_data, weight),axis=1)
otlt_id = onehotencode(data,column="Outlet_Identifier")
processed_data = np.concat((processed_data, otlt_id),axis=1)

#Try more features
visib = visib_quant_transformer.fit_transform(data["Item_Visibility"].values.reshape(-1, 1))
processed_data = np.concat((processed_data, visib),axis=1)

X = processed_data
y = quant_transformer.fit_transform(data["Item_Outlet_Sales"].values.reshape(-1, 1))
#y = normalize(data, "Item_Outlet_Sales")
X[:] = X[idxs]
y[:] = y[idxs]
print(X.shape, y.shape)

#random_forest_model.fit(X,y)
boost_model.fit(X,y)
#y_test_predict = random_forest_model.predict(X_test)
#linear_model.fit(X,y)

mrp = test_data["Item_MRP"].values.reshape(-1,1)
test_processed_data = mrp
year = test_data["Outlet_Establishment_Year"].values.reshape(-1,1)
test_processed_data = np.concat((test_processed_data, year),axis=1)
weight = test_data["Item_Weight"].values.reshape(-1,1)
test_processed_data = np.concat((test_processed_data, weight),axis=1)
otlt_id = onehotencode(test_data,column="Outlet_Identifier")
test_processed_data = np.concat((test_processed_data, otlt_id),axis=1)

#Try more features
visib = visib_quant_transformer.transform(test_data["Item_Visibility"].values.reshape(-1, 1))
test_processed_data = np.concat((test_processed_data, visib),axis=1)

X_test = test_processed_data
#y_test_predict = linear_model.predict(X_test)
#y_test_predict = random_forest_model.predict(X_test)
y_test_predict = boost_model.predict(X_test)
y_test = quant_transformer.inverse_transform(y_test_predict.reshape(-1, 1))
output_df = pd.DataFrame({
    "Item_Identifier": test_data["Item_Identifier"].values,
    "Outlet_Identifier": test_data["Outlet_Identifier"].values,
    "Item_Outlet_Sales": y_test.squeeze() 
})
output_df = output_df.reset_index(drop=True)
output_df.to_csv('submission.csv', index=False)

  lambda x: x.fillna(method='ffill').fillna(method='bfill'))
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  data["Item_Fat_Content"][data["Item_Fat_Content"] == key] = FAT_MAP[key]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas

(8523, 14) (8523, 1)


In [None]:
#Best hyperparameters: {'n_estimators': 397, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt'}
#Best score: -0.32020969816508693
import optuna
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_squared_error

def objective(trial):
    n_estimators = trial.suggest_int("n_estimators", 100, 500)
    max_depth = trial.suggest_int("max_depth", 5, 30)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 10)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 4)
    max_features = trial.suggest_categorical("max_features", ['sqrt', 'log2'])

    random_forest_model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42
    )

    scorer = make_scorer(mean_squared_error, greater_is_better=False)
    score = cross_val_score(random_forest_model, X, y, cv=5, scoring=scorer).mean()
    return score
study = optuna.create_study(direction="maximize")  # Or "minimize" for error metrics
study.optimize(objective, n_trials=100)
print("Best hyperparameters:", study.best_params)
print("Best score:", study.best_value)

[I 2025-08-31 01:07:36,246] A new study created in memory with name: no-name-4d13ab23-a8f1-4153-bd92-43b1446f7932
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
[I 2025-08-31 01:07:48,038] Trial 0 finished with value: -0.3797641249934953 and parameters: {'n_estimators': 480, 'max_depth': 5, 'min_samples_split': 6, 'min_samples_leaf': 3, 'max_features': 'sqrt'}. Best is trial 0 with value: -0.3797641249934953.
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
[I 2025-08-31 01:08:27,694] Trial 1 finished with value: -0.3333607280780739 and parameters: {'n_estimators': 352, 'max_depth': 7, 'min_samples_split': 2,

Best hyperparameters: {'n_estimators': 397, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt'}
Best score: -0.32020969816508693


In [None]:
#Best hyperparameters: {'alpha': 0.32406351326583166, 'solver': 'cholesky'}
#Best score: -0.3227281585583012
def objective(trial):
    alpha = trial.suggest_float('alpha', 1e-5, 1e2, log=True)  # Log scale for wide range
    
    solver = trial.suggest_categorical('solver', ['auto', 'svd', 'cholesky', 'lsqr'])
    
    model = Ridge(alpha=alpha, solver=solver)
    
    score = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error').mean()
    
    return score

study = optuna.create_study(direction='maximize')

study.optimize(objective, n_trials=500)

print("Best hyperparameters:", study.best_params)
print("Best score:", study.best_value)

[I 2025-08-31 08:36:15,838] A new study created in memory with name: no-name-19caac00-8678-4281-93e8-b53195b6f8a5


[I 2025-08-31 08:36:15,900] Trial 0 finished with value: -0.32272831238405 and parameters: {'alpha': 2.0472871685865284e-05, 'solver': 'auto'}. Best is trial 0 with value: -0.32272831238405.
[I 2025-08-31 08:36:15,979] Trial 1 finished with value: -0.3227283122361729 and parameters: {'alpha': 0.0001764410409700349, 'solver': 'auto'}. Best is trial 1 with value: -0.3227283122361729.
[I 2025-08-31 08:36:16,065] Trial 2 finished with value: -0.32272823591134575 and parameters: {'alpha': 0.09439926842712996, 'solver': 'auto'}. Best is trial 2 with value: -0.32272823591134575.
[I 2025-08-31 08:36:16,177] Trial 3 finished with value: -0.32802032435474116 and parameters: {'alpha': 68.84223762779152, 'solver': 'auto'}. Best is trial 2 with value: -0.32272823591134575.
[I 2025-08-31 08:36:16,244] Trial 4 finished with value: -0.32272831026226917 and parameters: {'alpha': 0.002265586404523495, 'solver': 'cholesky'}. Best is trial 2 with value: -0.32272823591134575.
[I 2025-08-31 08:36:16,372] Tr

Best hyperparameters: {'alpha': 0.32406351326583166, 'solver': 'cholesky'}
Best score: -0.3227281585583012


In [None]:
import optuna
from sklearn.model_selection import cross_val_score

def objective(trial):
    learning_rate = trial.suggest_float('learning_rate', 1e-4, 1, log=True) 
    max_depth = trial.suggest_int('max_depth', 3, 10)
    n_estimators = trial.suggest_int('n_estimators', 50, 1000)
    reg_alpha = trial.suggest_float('reg_alpha', 1e-5, 1e2, log=True)
    reg_lambda = trial.suggest_float('reg_lambda', 1e-5, 1e2, log=True)
    subsample = trial.suggest_float('subsample', 0.5, 1.0)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.5, 1.0)
    
    model = xgb.XGBRegressor(
        learning_rate=learning_rate,
        max_depth=max_depth,
        n_estimators=n_estimators,
        reg_alpha=reg_alpha,
        reg_lambda=reg_lambda,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        objective='reg:squarederror',
        random_state=42
    )
    
    score = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error').mean()
    
    return score

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=1000)
print("Best hyperparameters:", study.best_params)
print("Best score:", study.best_value)

[I 2025-08-31 12:08:09,458] A new study created in memory with name: no-name-95735f2f-e29f-44e5-a845-b32fe5299a47


[I 2025-08-31 12:08:11,374] Trial 0 finished with value: -0.3291807911920772 and parameters: {'learning_rate': 0.0035956341283439816, 'max_depth': 4, 'n_estimators': 638, 'reg_alpha': 0.0012230724613442401, 'reg_lambda': 0.014416972871592377, 'subsample': 0.6859201741629875, 'colsample_bytree': 0.9684850299557937}. Best is trial 0 with value: -0.3291807911920772.
[I 2025-08-31 12:08:12,090] Trial 1 finished with value: -0.358447922715113 and parameters: {'learning_rate': 0.3394899845651958, 'max_depth': 4, 'n_estimators': 216, 'reg_alpha': 0.0010832622206253893, 'reg_lambda': 3.071621322670456e-05, 'subsample': 0.748974587252078, 'colsample_bytree': 0.5222396247740846}. Best is trial 0 with value: -0.3291807911920772.
[I 2025-08-31 12:08:13,073] Trial 2 finished with value: -0.3214493820761014 and parameters: {'learning_rate': 0.03371447615077897, 'max_depth': 7, 'n_estimators': 195, 'reg_alpha': 1.7684306586356064, 'reg_lambda': 3.301998065328949e-05, 'subsample': 0.6014317389179609, 

Best hyperparameters: {'learning_rate': 0.019401080789977414, 'max_depth': 3, 'n_estimators': 461, 'reg_alpha': 1.3795139557001301, 'reg_lambda': 0.0005078995163813296, 'subsample': 0.636241062819135, 'colsample_bytree': 0.5636488212542046}
Best score: -0.3106418943618606
