In [None]:
# -------------------   Reading the Test and Train files that is in the CSV file format -----------------

import pandas as pd

Train_path = r"C:\Users\91995\Downloads\train_v9rqX0R.csv"
train_df = pd.read_csv(Train_path)

Test_path = r"C:\Users\91995\Downloads\test_AbJTz2l.csv"
test_df = pd.read_csv(Test_path)

display(train_df.head(2))

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228


In [20]:
# -------------------   Data Cleaning & Missing Value Treatment  -----------------
import numpy as np

# ---Create Mapping for Item_Fat_Content

def normalize_fat_content(value):
    if pd.isna(value):
        return value

    value = str(value).strip().lower()

    if "low" in value or value in {"lf", "l.f", "lowfat"}:
        return "Low Fat"
    else:
        return "Regular"
    

# ---Handle Missing features or variables

def fill_with_group_mode(series):
    mode = series.mode()
    return series.fillna(mode.iloc[0] if not mode.empty else series)
    

# ---Handle missing Item_Weight

def fill_item_weight(df):
    # 1. Fill using Item_Type median
    df["Item_Weight"] = (
        df.groupby("Item_Type")["Item_Weight"]
          .transform(lambda x: x.fillna(x.median()) if not np.isnan(x.median()) else x)
    )

    # 2. Global median fallback
    global_median = df["Item_Weight"].median()
    df["Item_Weight"] = df["Item_Weight"].fillna(global_median)

    return df


# ---Handle zero Item_Visibility

def fill_item_visibility(df):
    # 1. Replace 0 with NaN
    df["Item_Visibility"] = df["Item_Visibility"].replace(0, np.nan)

    # 2. Fill using Item_Type median
    df["Item_Visibility"] = (
        df.groupby("Item_Type")["Item_Visibility"]
          .transform(lambda x: x.fillna(x.median()) if not np.isnan(x.median()) else x)
    )

    # 3. Global median fallback
    global_median = df["Item_Visibility"].median()
    df["Item_Visibility"] = df["Item_Visibility"].fillna(global_median)

    return df


train_df["Item_Fat_Content"] = train_df["Item_Fat_Content"].apply(normalize_fat_content)
test_df["Item_Fat_Content"] = test_df["Item_Fat_Content"].apply(normalize_fat_content)

train_df["Outlet_Size"] = (
    train_df.groupby("Outlet_Type")["Outlet_Size"]
      .transform(fill_with_group_mode)
)

test_df["Outlet_Size"] = (
    test_df.groupby("Outlet_Type")["Outlet_Size"]
      .transform(fill_with_group_mode)
)

test_df = fill_item_weight(test_df)
train_df = fill_item_weight(train_df)

test_df = fill_item_visibility(test_df)
train_df = fill_item_visibility(train_df)

# ---Concatenating both Test and train dataset

train_df["control"] = "Train"
test_df["control"] = "Test"

full_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)


display(full_df.head(2))

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,control
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138,Train
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,Train


In [21]:
# -------------------   Feature Engineering & Data Transformation  -----------------

# --    Creating Outlet Age

full_df['Outlet_Age'] = 2013 - full_df['Outlet_Establishment_Year']

# --    Price Bands from Item_MRP

full_df['Item_MRP_Band'] = pd.qcut(full_df['Item_MRP'], 4,
                              labels=['Low', 'Medium', 'High', 'Very High'])

# ---   Visibility Ratio

full_df['Item_Visibility_Ratio'] = full_df['Item_Visibility'] / \
                              full_df.groupby('Item_Type')['Item_Visibility'].transform('mean')

# --    Calculating Proxy Quantity

full_df['Item_Quantity'] = full_df['Item_Outlet_Sales'] / full_df['Item_MRP']



display(full_df.head(2))


Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,control,Outlet_Age,Item_MRP_Band,Item_Visibility_Ratio,Item_Quantity
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138,Train,14,Very High,0.213522,14.951963
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,Train,4,Low,0.279895,9.186454


In [22]:
# -------------------   Clustering the dataframe both at Product and Outlet level  -----------------

# OUTLET + PRODUCT CLUSTERING

import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.cluster import KMeans

# ------------------------------
# OUTLET CLUSTERING
# ------------------------------

# Prepare outlet-level dataframe
outlet_df = full_df[[
    'Outlet_Identifier',
    'Outlet_Age',
    'Outlet_Size',
    'Outlet_Location_Type',
    'Outlet_Type'
]].drop_duplicates()



# Encode categorical outlet features
for col in ['Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']:
    le = LabelEncoder()
    outlet_df[col] = le.fit_transform(outlet_df[col])

# Scale outlet features
scaler_outlet = StandardScaler()
outlet_scaled = scaler_outlet.fit_transform(
    outlet_df.drop('Outlet_Identifier', axis=1)
)

# KMeans for outlets
kmeans_outlet = KMeans(
    n_clusters=5,
    random_state=42,
    n_init=10
)

outlet_df['Outlet_Cluster'] = kmeans_outlet.fit_predict(outlet_scaled)

# Merge outlet clusters back
full_df = full_df.merge(
    outlet_df[['Outlet_Identifier', 'Outlet_Cluster']],
    on='Outlet_Identifier',
    how='left'
)

# ------------------------------
# PRODUCT (ITEM) CLUSTERING
# ------------------------------

# Prepare item-level dataframe
item_df = full_df[[
    'Item_Identifier',
    'Item_MRP',
    'Item_Visibility',
    'Item_Weight'
]].drop_duplicates()

# Scale item features
scaler_item = StandardScaler()
item_scaled = scaler_item.fit_transform(
    item_df.drop('Item_Identifier', axis=1)
)

# KMeans for items
kmeans_item = KMeans(
    n_clusters=5,
    random_state=42,
    n_init=10
)

item_df['Item_Cluster'] = kmeans_item.fit_predict(item_scaled)

# Merge item clusters back
full_df = full_df.merge(
    item_df[['Item_Identifier', 'Item_Cluster']],
    on='Item_Identifier',
    how='left'
)


full_df["final_cluster"] = (
    full_df["Outlet_Cluster"].astype(str) + "_" +
    full_df["Item_Cluster"].astype(str)
)


full_df = full_df.drop_duplicates().reset_index(drop=True)

display(full_df.head(2))



Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,control,Outlet_Age,Item_MRP_Band,Item_Visibility_Ratio,Item_Quantity,Outlet_Cluster,Item_Cluster,final_cluster
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138,Train,14,Very High,0.213522,14.951963,1,4,1_4
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,Train,4,Low,0.279895,9.186454,0,0,0_0


In [37]:
# -------------------   Modeling based on XGBOOST and then by Hyperparameter tuning  -----------------


# -------------------   Cluster-wise XGBOOST Modeling  -----------------

import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

# ------------------------------
# TRAIN / TEST SPLIT (CONTROL)
# ------------------------------

train_df = full_df[full_df['control'] == 'Train'].copy()
test_df  = full_df[full_df['control'] == 'Test'].copy()

# Storage for results
train_preds_list = []
test_preds_list = []

# For overall RMSE calculation
y_train_all = []
train_pred_all = []


# Unique clusters
clusters = train_df['final_cluster'].unique()

for clus in clusters:
    # print(f"\n=== Processing final_cluster = {clus} ===")
    
    # Filter train & test for this cluster
    X_train = train_df[train_df['final_cluster'] == clus].drop(['Item_Outlet_Sales','control'], axis=1)
    y_train = train_df[train_df['final_cluster'] == clus]['Item_Outlet_Sales']
    
    X_test = test_df[test_df['final_cluster'] == clus].drop(['Item_Outlet_Sales','control'], axis=1, errors='ignore')
    
    # ----  One-Hot Encode low-cardinality columns
    low_card_cols = ['Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']
    X_train = pd.get_dummies(X_train, columns=low_card_cols, drop_first=True)
    X_test = pd.get_dummies(X_test, columns=low_card_cols, drop_first=True)
    
    # Align columns
    X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
    
    # ---- Label Encode high-cardinality columns
    high_card_cols = ['Item_Identifier', 'final_cluster', 'Outlet_Identifier']
    for col in high_card_cols:
        X_train[col] = X_train[col].astype('category').cat.codes
        X_test[col] = X_test[col].astype('category').cat.codes
    
    # ------------------------------
    # XGBOOST MODEL
    # ------------------------------
    xgb_model = XGBRegressor(
        objective='reg:squarederror',
        n_estimators=500,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        enable_categorical=True
    )
    
    # Fit model
    xgb_model.fit(X_train, y_train)
    
    # ------------------------------
    # PREDICT ON TRAIN (for RMSE)
    # ------------------------------
    train_preds = xgb_model.predict(X_train)
    train_rmse = np.sqrt(mean_squared_error(y_train, train_preds))
    # print(f"Train RMSE for cluster {clus}: {train_rmse:.2f}")

    # Save for overall RMSE
    y_train_all.extend(y_train)
    train_pred_all.extend(train_preds)
    
    train_preds_df = train_df[train_df['final_cluster'] == clus].copy()
    train_preds_df['Item_Outlet_Sales_Predicted'] = train_preds
    train_preds_list.append(train_preds_df)
    
    # ------------------------------
    # PREDICT ON TEST
    # ------------------------------
    test_preds = xgb_model.predict(X_test)
    test_preds_df = test_df[test_df['final_cluster'] == clus].copy()
    test_preds_df['Item_Outlet_Sales_Predicted'] = test_preds
    test_preds_list.append(test_preds_df)

# ------------------------------
# COMBINE ALL CLUSTERS
# ------------------------------
train_df_final = pd.concat(train_preds_list)
test_df_final  = pd.concat(test_preds_list)

# ------------------------------
# OVERALL TRAIN RMSE
# ------------------------------
overall_rmse = np.sqrt(mean_squared_error(y_train_all, train_pred_all))
print(f"\n=== Overall Train RMSE across all clusters: {overall_rmse:.2f} ===")

# ------------------------------
# SANITY CHECK
# ------------------------------
# print("\nTrain predictions head:")
# print(train_df_final[['Item_Identifier','Outlet_Identifier','Item_Outlet_Sales','Item_Outlet_Sales_Predicted']].head())

# print("\nTest predictions head:")
# print(test_df_final[['Item_Identifier','Outlet_Identifier','Item_Outlet_Sales_Predicted']].head())




=== Overall Train RMSE across all clusters: 4.75 ===



=== Overall Train RMSE across all clusters: 4.75 ===


In [None]:
# X_train.dtypes[X_train.dtypes == "object"]

print(X_train.dtypes.value_counts())

In [34]:
# Example: df is your DataFrame
file_path = r"C:\Users\91995\Downloads\test_data.csv"

# Export DataFrame to CSV
test_df_final.to_csv(file_path, index=False)  # index=False avoids writing row numbers

print(f"File saved successfully at {file_path}")

File saved successfully at C:\Users\91995\Downloads\test_data.csv
