#### Imports + Load Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Raw train/test CSVs. NOTE(review): these are absolute paths on one
# machine's D: drive, so the notebook only runs where that folder exists.
train_path = r"D:\ABB_problem_statement\train_v9rqX0R.csv"
test_path = r"D:\ABB_problem_statement\test_AbJTz2l.csv"

train_df, test_df = pd.read_csv(train_path), pd.read_csv(test_path)

#### Basic Cleaning

In [3]:
def basic_cleaning(df):
    """Return a cleaned copy of ``df``; the input frame is left untouched.

    Steps, in order:
      1. Collapse the alternate spellings of ``Item_Fat_Content``
         ("low fat", "LF", "reg") onto "Low Fat" / "Regular".
      2. Add ``Item_Group`` from the first two characters of
         ``Item_Identifier`` (FD / DR / NC); other prefixes map to NaN.
      3. Set ``Item_Fat_Content`` to the string "NA" for every
         "Non-Consumable" row.
    """
    fat_content_aliases = {
        "low fat": "Low Fat",
        "LF": "Low Fat",
        "reg": "Regular",
    }
    group_by_prefix = {
        "FD": "Food",
        "DR": "Drinks",
        "NC": "Non-Consumable",
    }

    cleaned = df.copy()

    cleaned["Item_Fat_Content"] = cleaned["Item_Fat_Content"].replace(
        fat_content_aliases
    )

    id_prefix = cleaned["Item_Identifier"].str.slice(0, 2)
    cleaned["Item_Group"] = id_prefix.map(group_by_prefix)

    # Fat content is overwritten with a literal "NA" for non-consumables.
    non_consumable = cleaned["Item_Group"].eq("Non-Consumable")
    cleaned.loc[non_consumable, "Item_Fat_Content"] = "NA"

    return cleaned


# Run the same cleaning rules over both splits; each name is rebound to the
# cleaned copy returned by basic_cleaning.
train_df, test_df = (basic_cleaning(split) for split in (train_df, test_df))

#### Train-only imputation
##### Statistics learned only from training data to prevent leakage.

In [4]:
# Median weight overall and per item, both taken from the training rows only.
global_weight = train_df["Item_Weight"].median()
weight_map = train_df.groupby("Item_Identifier")["Item_Weight"].median()

# Fill each missing weight with the item's training median; rows the per-item
# map cannot resolve fall back to the overall training median.
for df in (train_df, test_df):
    per_item_weight = df["Item_Identifier"].map(weight_map)
    weight_filled = df["Item_Weight"].fillna(per_item_weight)
    df["Item_Weight"] = weight_filled.fillna(global_weight)

def _most_frequent(values):
    """Return the most frequent non-null value of ``values``, or NaN if none.

    ``Series.mode()`` returns an empty Series when every value is null, so
    indexing its first element unconditionally would raise for such a group.
    """
    modes = values.mode()
    return modes.iloc[0] if len(modes) else np.nan


# Most frequent Outlet_Size per Outlet_Type, learned from training rows only.
outlet_size_map = (
    train_df.groupby("Outlet_Type")["Outlet_Size"]
    .agg(_most_frequent)
)

# Overall most frequent training size: fallback for outlet types the map
# cannot resolve (absent from training, or with no recorded size there).
global_outlet_size = _most_frequent(train_df["Outlet_Size"])

for df in [train_df, test_df]:
    df["Outlet_Size"] = (
        df["Outlet_Size"]
        .fillna(df["Outlet_Type"].map(outlet_size_map))
        .fillna(global_outlet_size)
    )

# Zero visibility is treated as a missing value below, so the replacement
# statistics are learned from the strictly positive training values only.
# Leaving the zeros in would drag each median toward zero, and a type with
# mostly-zero rows would "impute" zero with zero.
observed_visibility = train_df.loc[train_df["Item_Visibility"] > 0]
visibility_map = (
    observed_visibility.groupby("Item_Type")["Item_Visibility"].median()
)

# Fallback for item types with no positive visibility in the training rows.
global_visibility = observed_visibility["Item_Visibility"].median()

for df in [train_df, test_df]:
    zero_mask = df["Item_Visibility"] == 0
    df.loc[zero_mask, "Item_Visibility"] = (
        df.loc[zero_mask, "Item_Type"]
        .map(visibility_map)
        .fillna(global_visibility)
    )


#### Feature engineering

- Outlet maturity → store stability
- Visibility normalization → shelf exposure
- Relative price → competitive positioning
- Item–Outlet exposure → demand context

In [5]:
def engineer_features(df, reference_df=None):
    """Add derived outlet, visibility and price features to a copy of ``df``.

    Every group-level statistic (per-type and per-outlet visibility means,
    per-type price mean, item-type share, per-outlet type variety, MRP
    quartile edges and the type x outlet-type visibility mean) is computed
    from ``reference_df`` and then looked up for the rows of ``df``. Passing
    the training frame as the reference when transforming the test frame
    gives both splits the same scales and the same MRP band boundaries;
    otherwise an identical raw value could yield different features in each.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to transform; it is not modified. Reads the columns
        Outlet_Establishment_Year, Item_Visibility, Item_Type,
        Outlet_Identifier, Outlet_Type, Item_MRP, Item_Weight,
        Item_Fat_Content and Item_Group.
    reference_df : pandas.DataFrame, optional
        Frame the group statistics are learned from. Defaults to ``df``
        itself, which reproduces the original single-frame behaviour.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the feature columns appended.

    Notes
    -----
    Group keys of ``df`` that do not occur in ``reference_df`` produce NaN in
    the features looked up by that key.
    """
    out = df.copy()
    ref = df if reference_df is None else reference_df

    # --- statistics learned from the reference frame -----------------------
    type_visibility_mean = ref.groupby("Item_Type")["Item_Visibility"].mean()
    outlet_visibility_mean = (
        ref.groupby("Outlet_Identifier")["Item_Visibility"].mean()
    )
    type_price_mean = ref.groupby("Item_Type")["Item_MRP"].mean()
    type_share = ref.groupby("Item_Type").size() / ref.shape[0]
    outlet_variety = ref.groupby("Outlet_Identifier")["Item_Type"].nunique()
    pair_exposure = (
        ref.groupby(["Item_Type", "Outlet_Type"])["Item_Visibility"].mean()
    )

    # Quartile edges of the reference prices. The outer edges are opened to
    # +/- infinity so values outside the reference range still get the
    # lowest / highest band instead of NaN.
    _, mrp_edges = pd.qcut(ref["Item_MRP"], q=4, retbins=True)
    mrp_edges = np.asarray(mrp_edges, dtype=float)
    mrp_edges[0], mrp_edges[-1] = -np.inf, np.inf

    # --- row-level features ------------------------------------------------
    # Outlet age relative to the hard-coded reference year 2013.
    out["Outlet_Age"] = 2013 - out["Outlet_Establishment_Year"]
    out["Outlet_Maturity_Score"] = np.log1p(out["Outlet_Age"])

    out["Visibility_Index"] = (
        out["Item_Visibility"] / out["Item_Type"].map(type_visibility_mean)
    )

    out["Outlet_Visibility_Ratio"] = (
        out["Item_Visibility"]
        / out["Outlet_Identifier"].map(outlet_visibility_mean)
    )

    out["Price_Weight_Ratio"] = out["Item_MRP"] / out["Item_Weight"]

    out["Price_Relative_Mean"] = (
        out["Item_MRP"] / out["Item_Type"].map(type_price_mean)
    )

    out["Item_Type_Density"] = out["Item_Type"].map(type_share)

    out["Is_LowFat_Food"] = (
        (out["Item_Fat_Content"] == "Low Fat") &
        (out["Item_Group"] == "Food")
    ).astype(int)

    # 0 below 0.01, 0.5 above 0.2, otherwise 1.
    out["Visibility_Quality"] = np.where(
        out["Item_Visibility"] < 0.01, 0,
        np.where(out["Item_Visibility"] > 0.2, 0.5, 1)
    )

    out["Outlet_Item_Variety"] = out["Outlet_Identifier"].map(outlet_variety)

    out["MRP_Band"] = pd.cut(
        out["Item_MRP"], bins=mrp_edges,
        labels=["Low", "Medium", "High", "Premium"]
    ).astype(str)

    # Two-key lookup: align the (Item_Type, Outlet_Type) pair of every row
    # with the reference means; unmatched pairs become NaN.
    pair_keys = pd.MultiIndex.from_frame(out[["Item_Type", "Outlet_Type"]])
    out["Item_Outlet_Exposure"] = pair_exposure.reindex(pair_keys).to_numpy()

    return out


# The training frame is the statistics reference for both splits, so test
# features use train-derived means, shares and MRP band edges.
train_df = engineer_features(train_df)
test_df = engineer_features(test_df, reference_df=train_df)

In [6]:
# Raw identifier columns and the establishment year are removed from both
# splits; Item_Group and Outlet_Age were derived from them in earlier cells.
drop_cols = ["Item_Identifier", "Outlet_Identifier", "Outlet_Establishment_Year"]

# Dropped in place, so each existing frame object is mutated.
for frame in (train_df, test_df):
    frame.drop(columns=drop_cols, inplace=True)