In [None]:
# -------------------   Reading the Train and Test files, which are in CSV format -----------------

import pandas as pd

# Locations of the two input CSV files.
# NOTE(review): these are absolute paths under one user's Downloads folder,
# so the cell only runs on a machine that has the files at these locations.
Train_path = r"C:\Users\91995\Downloads\train_v9rqX0R.csv"
Test_path = r"C:\Users\91995\Downloads\test_AbJTz2l.csv"

# Read each CSV into its own DataFrame (train first, then test).
train_df, test_df = map(pd.read_csv, (Train_path, Test_path))

# Preview the first two rows of the training data.
display(train_df.head(2))

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228


In [5]:
# -------------------   Data Cleaning & Missing Value Treatment  -----------------
import numpy as np

# ---Create Mapping for Item_Fat_Content

def normalize_fat_content(value):
    """Collapse a raw ``Item_Fat_Content`` label to "Low Fat" or "Regular".

    Parameters
    ----------
    value : object
        A single raw label. Missing values (as detected by ``pd.isna``) are
        passed through untouched.

    Returns
    -------
    object
        ``value`` itself when it is missing; otherwise ``"Low Fat"`` if the
        trimmed, lower-cased text contains ``"low"`` or is one of the known
        abbreviations, and ``"Regular"`` for every other input.
    """
    # Leave missing entries alone so they can be handled separately.
    if pd.isna(value):
        return value

    label = str(value).strip().lower()

    # Any label that is not recognised as low-fat falls through to "Regular".
    is_low_fat = label in {"lf", "l.f", "lowfat"} or "low" in label
    return "Low Fat" if is_low_fat else "Regular"
    

# ---Fill missing values in a (grouped) column with that group's mode

def fill_with_group_mode(series):
    """Fill the missing entries of ``series`` with its most frequent value.

    Intended to be passed to ``groupby(...).transform`` so that each group is
    filled with its own mode.

    Parameters
    ----------
    series : pandas.Series
        The values of one group.

    Returns
    -------
    pandas.Series
        The result of ``series.fillna(...)``: missing entries replaced by the
        first value returned by ``series.mode()``. When the series has no mode
        (``mode()`` is empty) the series is filled with itself, which leaves
        the missing entries missing.
    """
    modes = series.mode()

    # With no mode available, fall back to the series itself: fillna then has
    # nothing new to put into the gaps.
    replacement = series if modes.empty else modes.iloc[0]
    return series.fillna(replacement)
    

# ---Handle missing Item_Weight

def fill_item_weight(df):
    """Impute missing ``Item_Weight`` values.

    Missing weights are filled in two passes:

    1. with the median weight of the row's ``Item_Type`` group;
    2. with the median of the whole (already partly filled) column, for rows
       whose group offered no median.

    Only missing cells are written. A row that already has a weight keeps it
    even when its ``Item_Type`` is missing (such rows belong to no group, so
    a grouped ``transform`` yields NaN for them).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``Item_Type`` and ``Item_Weight`` columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object; its ``Item_Weight`` column is updated in place.
    """
    # 1. Per-row median of the row's Item_Type group. This is NaN where the
    #    group has no known weight or where Item_Type itself is missing.
    type_median = df.groupby("Item_Type")["Item_Weight"].transform("median")
    df["Item_Weight"] = df["Item_Weight"].fillna(type_median)

    # 2. Global median fallback for anything the group medians could not fill.
    global_median = df["Item_Weight"].median()
    df["Item_Weight"] = df["Item_Weight"].fillna(global_median)

    return df


# ---Handle zero Item_Visibility

def fill_item_visibility(df):
    """Impute zero or missing ``Item_Visibility`` values.

    A visibility of exactly 0 is treated as missing. Missing values are then
    filled in two passes:

    1. with the median visibility of the row's ``Item_Type`` group;
    2. with the median of the whole (already partly filled) column, for rows
       whose group offered no median.

    Only zero/missing cells are written. A row that already has a non-zero
    visibility keeps it even when its ``Item_Type`` is missing (such rows
    belong to no group, so a grouped ``transform`` yields NaN for them).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``Item_Type`` and ``Item_Visibility`` columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object; its ``Item_Visibility`` column is updated in
        place.
    """
    # 1. Replace 0 with NaN so zeros take part in the imputation below and do
    #    not pull the medians down.
    df["Item_Visibility"] = df["Item_Visibility"].replace(0, np.nan)

    # 2. Per-row median of the row's Item_Type group. This is NaN where the
    #    group has no usable value or where Item_Type itself is missing.
    type_median = df.groupby("Item_Type")["Item_Visibility"].transform("median")
    df["Item_Visibility"] = df["Item_Visibility"].fillna(type_median)

    # 3. Global median fallback for anything the group medians could not fill.
    global_median = df["Item_Visibility"].median()
    df["Item_Visibility"] = df["Item_Visibility"].fillna(global_median)

    return df


# Apply the cleaning steps to each frame in turn. Every statistic used for
# imputation (group mode, group/global median) is computed from the frame
# being cleaned.

# --- Training set
train_df["Item_Fat_Content"] = train_df["Item_Fat_Content"].apply(normalize_fat_content)
# Missing Outlet_Size -> most frequent size among outlets of the same Outlet_Type.
train_df["Outlet_Size"] = train_df.groupby("Outlet_Type")["Outlet_Size"].transform(fill_with_group_mode)
train_df = fill_item_visibility(fill_item_weight(train_df))

# --- Test set
test_df["Item_Fat_Content"] = test_df["Item_Fat_Content"].apply(normalize_fat_content)
test_df["Outlet_Size"] = test_df.groupby("Outlet_Type")["Outlet_Size"].transform(fill_with_group_mode)
test_df = fill_item_visibility(fill_item_weight(test_df))


# Preview the first two cleaned training rows.
display(train_df.head(2))

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
