# Libraries

In [3]:
import pandas as pd
import numpy as np

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

# Load Data

In [6]:
# Load the datasets
train_df = pd.read_csv("./assets/ip/train_v9rqX0R.csv")
print(f"Train dataset size: {train_df.shape}")
display(train_df.info())
train_df.head()

Train dataset size: (8523, 12)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


None

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


# EDA

## Summary Stats

In [9]:
missing_values = train_df.isnull().sum().sort_values(ascending=False)
missing_values = missing_values[missing_values > 0]
missing_values

Outlet_Size    2410
Item_Weight    1463
dtype: int64

In [10]:
display(train_df['Item_Type'].value_counts()); print("\n\n")
display(train_df['Outlet_Identifier'].value_counts()); print("\n\n")
display(train_df['Outlet_Size'].value_counts(normalize=True)*100); print("\n\n")
display(train_df['Outlet_Location_Type'].value_counts(normalize=True)*100); print("\n\n")
display(train_df['Outlet_Type'].value_counts()); print("\n\n")

Item_Type
Fruits and Vegetables    1232
Snack Foods              1200
Household                 910
Frozen Foods              856
Dairy                     682
Canned                    649
Baking Goods              648
Health and Hygiene        520
Soft Drinks               445
Meat                      425
Breads                    251
Hard Drinks               214
Others                    169
Starchy Foods             148
Breakfast                 110
Seafood                    64
Name: count, dtype: int64






Outlet_Identifier
OUT027    935
OUT013    932
OUT049    930
OUT046    930
OUT035    930
OUT045    929
OUT018    928
OUT017    926
OUT010    555
OUT019    528
Name: count, dtype: int64






Outlet_Size
Medium    45.689514
Small     39.064289
High      15.246197
Name: proportion, dtype: float64






Outlet_Location_Type
Tier 3    39.305409
Tier 2    32.676288
Tier 1    28.018303
Name: proportion, dtype: float64






Outlet_Type
Supermarket Type1    5577
Grocery Store        1083
Supermarket Type3     935
Supermarket Type2     928
Name: count, dtype: int64






## Combine train and test for consistent encoding

In [12]:
# Reload test and sample submission files after environment reset
test_df = pd.read_csv("./assets/ip/test_AbJTz2l.csv")
submission_df = pd.read_csv("./assets/ip/sample_submission_8RXa3c6.csv")

In [13]:
# Combine train and test for consistent encoding
train_df['source'] = 'train'
test_df['source'] = 'test'
test_df['Item_Outlet_Sales'] = np.nan
combined = pd.concat([train_df, test_df], ignore_index=True)

## Data Cleaning

In [15]:
# Handle missing values
combined['Item_Weight'].fillna(combined['Item_Weight'].mean(), inplace=True)
combined['Outlet_Size'].fillna('Medium', inplace=True)

# Normalize inconsistent values
combined['Item_Fat_Content'] = combined['Item_Fat_Content'].replace({
    'low fat': 'Low Fat',
    'LF': 'Low Fat',
    'reg': 'Regular'
})

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined['Item_Weight'].fillna(combined['Item_Weight'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined['Outlet_Size'].fillna('Medium', inplace=True)


### Categorical Encoding

In [17]:
# Encode categorical features
categorical_cols = combined.select_dtypes(include='object').columns.drop(['Item_Identifier', 'source'])
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
combined[categorical_cols] = encoder.fit_transform(combined[categorical_cols])

# Split back
train_encoded = combined[combined['source'] == 'train'].drop(columns=['source'])
test_encoded = combined[combined['source'] == 'test'].drop(columns=['source', 'Item_Outlet_Sales'])

In [18]:
# Prepare features and target
X = train_encoded.drop(columns=['Item_Identifier', 'Item_Outlet_Sales'])
y = train_encoded['Item_Outlet_Sales']

## Train Test Split

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

In [21]:
# Train-test split for validation
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

## RF Regressor

In [23]:
from sklearn.ensemble import RandomForestRegressor

In [24]:
# Model: Random Forest
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predictions and evaluation
y_pred = model.predict(X_valid)
rf_regressor_rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
rf_regressor_rmse

1079.597643405983

In [25]:
print(f"RF Regressor RMSE: {rf_regressor_rmse}")

RF Regressor RMSE: 1079.597643405983


## XGBoost Regressos

In [27]:
# Train XGBoost model
xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
xgb_model.fit(X_train, y_train)
y_pred = model.predict(X_valid)
xgboost_rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
print(f"Train RMSE: {xgboost_rmse}")

Train RMSE: 1079.597643405983


In [28]:
# Predict on test set
test_features = test_encoded.drop(columns=['Item_Identifier'])
test_predictions = model.predict(test_features)

# Submission Preparation

In [30]:
# Prepare final submission DataFrame
submission_df = test_encoded[['Item_Identifier', 'Outlet_Identifier']].copy()
submission_df['Item_Outlet_Sales'] = test_predictions
submission_df.head()

Unnamed: 0,Item_Identifier,Outlet_Identifier,Item_Outlet_Sales
8523,FDW58,9.0,1790.322884
8524,FDW14,2.0,1059.847072
8525,NCN55,0.0,576.862436
8526,FDQ58,2.0,2585.055054
8527,FDY38,5.0,6377.252114


In [31]:
submission_df.to_csv("./Big_Mart_Sales_Prediction.csv", index=False)