In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
%matplotlib inline
warnings.filterwarnings("ignore")
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import statsmodels.api as sm


In [2]:
# importing data.
df = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [3]:
#-------------------- filling missing---------------------------
class CustomImputer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.item_mean = None
        self.overall_mean = None
        self.group_modes = None
        self.type_modes = None

    def fit(self, X, y=None):
        # Step 1: Store mean per Item_Identifier and overall mean.
        self.item_mean = X.groupby("Item_Identifier")["Item_Weight"].mean()
        self.overall_mean = X["Item_Weight"].mean()

        # Step 2: Store mode by (Location, Type) and by only Type
        self.group_modes = (
            X.groupby(["Outlet_Location_Type", "Outlet_Type"])["Outlet_Size"]
            .agg(lambda x: x.mode()[0] if not x.mode().empty else np.nan)
        )
        self.type_modes = (
            X.groupby(["Outlet_Type"])["Outlet_Size"]
            .agg(lambda x: x.mode()[0] if not x.mode().empty else np.nan)
        )
        return self

    def transform(self, X):
        X = X.copy()

        # --- Item_Weight Imputation ---
        X["Item_Weight"] = X.apply(
            lambda row: self.item_mean.get(row["Item_Identifier"], np.nan)
            if pd.isnull(row["Item_Weight"]) else row["Item_Weight"],
            axis=1
        )
        X["Item_Weight"] = X["Item_Weight"].fillna(self.overall_mean)

        # --- Outlet_Size Imputation ---
        def fill_outlet_size(row):
            if pd.isnull(row["Outlet_Size"]):
                val = self.group_modes.get((row["Outlet_Location_Type"], row["Outlet_Type"]), np.nan)
                if pd.isnull(val):
                    val = self.type_modes.get(row["Outlet_Type"], np.nan)
                if pd.isnull(val):
                    val = "Unknown"
                return val
            return row["Outlet_Size"]

        X["Outlet_Size"] = X.apply(fill_outlet_size, axis=1)

        # ====== Handle Item_Visibility ======
        # Replace 0 with NaN
        X["Item_Visibility"].replace(0, np.nan, inplace=True)
        
        # Fill with mean visibility per Item_Identifier
        X["Item_Visibility"] = X.groupby("Item_Identifier")["Item_Visibility"].transform(
            lambda x: x.fillna(x.mean())
        )
        
        # Still NaN? Fill with overall mean
        X["Item_Visibility"].fillna(X["Item_Visibility"].mean(), inplace=True)
        
        # Cap extreme values at 99th percentile
        q99 = X["Item_Visibility"].quantile(0.99)
        X.loc[X["Item_Visibility"] > q99, "Item_Visibility"] = q99

        # # --- Fix inconsistent categories in Item_Fat_Content ---
        X["Item_Fat_Content"] = X["Item_Fat_Content"].replace({"low fat": "LF"})
        
        # --- Final fallback for any leftover missing ---
        for col in X.columns:
            if X[col].dtype == "object":
                X[col] = X[col].fillna("Unknown")
            else:
                X[col] = X[col].fillna(0)

        return X



In [4]:
# --------------------------
# STEP 2: Feature Engineering
# --------------------------

def feature_engineering(df):
    df = df.copy()
    
    # Your existing features
    df['New_Item_type'] = df['Item_Identifier'].apply(lambda x: x[:2])
    df['New_Item_type'] = df['New_Item_type'].map({'FD':'FOOD','NC':'Non-Consumable','DR':'Drinks'})
    df['Outlet_Age'] = 2013 - df['Outlet_Establishment_Year']
    df['Item_Age'] = 2013 - df['Item_Identifier'].str.extract(r'(\d+)').astype(float)
    df['Visibility_Avg'] = df.groupby('Item_Identifier')['Item_Visibility'].transform('mean')
    df['Visibility_Ratio'] = df['Item_Visibility'] / df['Visibility_Avg']
    df['Item_Weight_x_Item_Visibility'] = df['Item_Weight'] * df['Item_Visibility']
    df['Item_Weight_x_Outlet_Age'] = df['Item_Weight'] * df['Outlet_Age']
    df['MRP_x_Visibility'] = df['Item_MRP'] * df['Item_Visibility']
    df['Visibility_x_Size'] = df['Item_Visibility'] * df['Outlet_Size'].map({'Small':1, 'Medium':2, 'High':3})
    df['Visibility_x_Tier'] = df['Item_Visibility'] * df['Outlet_Location_Type'].map({'Tier 1':1, 'Tier 2':2, 'Tier 3':3})
    df['Visibility_x_OutletType'] = df['Item_Visibility'].astype(float) * df['Outlet_Type'].astype('category').cat.codes
    
    # =========================
    # NEW HIGH-IMPACT FEATURES
    # =========================
    
    # 2. STATISTICAL AGGREGATIONS
    # MRP statistics by categories
    df['Avg_MRP_by_Item_Type'] = df.groupby('New_Item_type')['Item_MRP'].transform('mean')
    df['Avg_MRP_by_Outlet_Type'] = df.groupby('Outlet_Type')['Item_MRP'].transform('mean')
    df['MRP_vs_Category_Avg'] = df['Item_MRP'] / df['Avg_MRP_by_Item_Type']
    
    # Visibility statistics
    df['Avg_Visibility_by_Outlet'] = df.groupby('Outlet_Identifier')['Item_Visibility'].transform('mean')
    df['Visibility_vs_Outlet_Avg'] = df['Item_Visibility'] / df['Avg_Visibility_by_Outlet']
    
    # 3. PRICE POSITIONING FEATURES
    df['MRP_Percentile_by_Item_Type'] = df.groupby('New_Item_type')['Item_MRP'].rank(pct=True)
    df['MRP_Percentile_by_Outlet'] = df.groupby('Outlet_Type')['Item_MRP'].rank(pct=True)
    
    # 4. BINNED FEATURES (often improve tree models)
    df['MRP_Bins'] = pd.qcut(df['Item_MRP'], q=5, labels=False, duplicates='drop')
    df['Weight_Bins'] = pd.qcut(df['Item_Weight'], q=5, labels=False, duplicates='drop')
    df['Visibility_Bins'] = pd.qcut(df['Item_Visibility'], q=5, labels=False, duplicates='drop')
    df['Age_Bins'] = pd.cut(df['Outlet_Age'], bins=[0, 5, 15, 25, 35], labels=False)
    
    # 5. ADVANCED INTERACTION FEATURES
    # Multi-way interactions
    df['MRP_x_Age_x_Size'] = df['Item_MRP'] * df['Outlet_Age'] * df['Outlet_Size'].map({'Small':1, 'Medium':2, 'High':3})
    df['Weight_x_Visibility_x_Type'] = df['Item_Weight'] * df['Item_Visibility'] * df['New_Item_type'].map({'FOOD':1, 'Drinks':2, 'Non-Consumable':3})
    
    # Type-specific pricing
    df['MRP_x_Item_Type'] = df['Item_MRP'] * df['New_Item_type'].map({'FOOD':1, 'Drinks':2, 'Non-Consumable':3})
    df['Age_x_Location_Type'] = df['Outlet_Age'] * df['Outlet_Location_Type'].map({'Tier 1':3, 'Tier 2':2, 'Tier 3':1})
    
    # 6. RATIO FEATURES
    df['Weight_to_MRP_Ratio'] = df['Item_Weight'] / (df['Item_MRP'] + 1)
    df['Visibility_to_Weight_Ratio'] = df['Item_Visibility'] / (df['Item_Weight'] + 1)
    df['MRP_to_Outlet_Age_Ratio'] = df['Item_MRP'] / (df['Outlet_Age'] + 1)
    
    # 7. POLYNOMIAL FEATURES for key numerical variables
    df['Item_MRP_Squared'] = df['Item_MRP'] ** 2
    df['Item_Weight_Squared'] = df['Item_Weight'] ** 2
    df['Outlet_Age_Squared'] = df['Outlet_Age'] ** 2
    
    # 8. OUTLET PERFORMANCE INDICATORS
    # Count of items per outlet (proxy for store size/variety)
    df['Items_per_Outlet'] = df.groupby('Outlet_Identifier')['Item_Identifier'].transform('count')
    
    # Average weight of items in outlet
    df['Avg_Weight_by_Outlet'] = df.groupby('Outlet_Identifier')['Item_Weight'].transform('mean')
    df['Weight_vs_Outlet_Avg'] = df['Item_Weight'] / df['Avg_Weight_by_Outlet']
    
    # 9. ITEM POPULARITY FEATURES
    # How many outlets sell this item (popularity indicator)
    df['Item_Popularity'] = df.groupby('Item_Identifier')['Outlet_Identifier'].transform('nunique')
    df['Item_Popularity_Ratio'] = df['Item_Popularity'] / df['Item_Popularity'].max()
    
    # 10. COMPOSITE FEATURES
    # Create a "Premium Index"
    df['Premium_Index'] = (df['Item_MRP'] * df['Item_Weight']) / (df['Item_Visibility'] + 0.001)
    
    # Market positioning
    df['Market_Position'] = df['MRP_Percentile_by_Item_Type'] * df['Visibility_Ratio']
    
    return df

In [5]:
## just significat column

from sklearn.base import BaseEstimator, TransformerMixin

class ColumnKeeper(BaseEstimator, TransformerMixin):
    def __init__(self, columns_to_keep=None):
        # Default columns to keep
        if columns_to_keep is None:
            self.columns_to_keep = [
                'Item_Outlet_Sales',
                'Outlet_Age',
                'Item_Age',
                'Outlet_Size',
                'Outlet_Location_Type',
                'Outlet_Type',
                'Item_MRP',
                'Item_Fat_Content'
                
]
        else:
            self.columns_to_keep = columns_to_keep
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.copy()
        # Only keep columns that exist in the dataframe
        existing_cols_to_keep = [col for col in self.columns_to_keep if col in X.columns]
        return X[existing_cols_to_keep]

In [6]:
from sklearn.base import BaseEstimator, TransformerMixin

class CategoricalEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        # Predefined mappings
        self.outlet_size_map = {
            'High': 2,
            'Medium': 1,
            'Small': 0
        }
        self.outlet_type_map = {
            'Grocery Store': 0,
            'Supermarket Type1': 2,
            'Supermarket Type2': 1,
            'Supermarket Type3': 3
        }
        self.outlet_location_map = {
            'Tier 1': 0,
            'Tier 2': 2,
            'Tier 3': 1
        }
        self.item_fat_map = {
            'LF': 1,
            'Low Fat': 2,
            'Regular': 3,
            'reg': 0
        }
        

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()

        # Apply categorical encodings
        if 'Outlet_Size' in X.columns:
            X['Outlet_Size'] = X['Outlet_Size'].map(self.outlet_size_map).fillna(0).astype(int)

        if 'Outlet_Type' in X.columns:
            X['Outlet_Type'] = X['Outlet_Type'].map(self.outlet_type_map).fillna(0).astype(int)

        if 'Outlet_Location_Type' in X.columns:
            X['Outlet_Location_Type'] = X['Outlet_Location_Type'].map(self.outlet_location_map).fillna(0).astype(int)

        if 'Item_Fat_Content' in X.columns:
            X['Item_Fat_Content'] = X['Item_Fat_Content'].map(self.item_fat_map).fillna(0).astype(int)

        
        return X


In [7]:
pipeline = Pipeline([
    ('imputer', CustomImputer()),
    ('feature_eng', FunctionTransformer(feature_engineering)),
    ('ColumnKeeper', ColumnKeeper()),
    ('CategoricalEncoder', CategoricalEncoder())
])

In [8]:
# -----------------------------
# STEP 1: Fit the pipeline on training data
# -----------------------------
pipeline.fit(df)  # only fit on training data

In [9]:
# -----------------------------
# STEP 2: Transform training data
# -----------------------------
df_train_transformed = pipeline.transform(df)

In [10]:
# -----------------------------
# STEP 3: Transform test data
# -----------------------------
x_test = pipeline.transform(test)


In [11]:
X = df_train_transformed.drop(columns=['Item_Outlet_Sales'])
y = df_train_transformed['Item_Outlet_Sales']

In [12]:
import lightgbm as lgb
# Initialize the LGBMRegressor with your specified parameters
lgbm_model = lgb.LGBMRegressor(
    n_estimators=100, 
    random_state=42, 
    verbose=-1
)

In [14]:

# Fit the model on your training data
# Replace X_train and y_train with your actual feature matrix and target variable
print("Training the LGBMRegressor model...")
lgbm_model.fit(X,y)

Training the LGBMRegressor model...


In [15]:
# Make predictions on the test set
print("Making predictions on test data...")
y_test_pred = lgbm_model.predict(x_test)

Making predictions on test data...


In [16]:

y_pred = lgbm_model.predict(X)
residuals = y - y_pred


In [17]:
std_resid = residuals / np.std(residuals)
influential_idx = np.where(np.abs(std_resid) > 0.2)[0]  # threshold can be 2.5 or 3
influential_idx

array([   0,    1,    2, ..., 8519, 8521, 8522])

In [18]:
X_train_clean = X.drop(index=influential_idx)
y_train_clean = y.drop(index=influential_idx)
X_train_clean.head()

Unnamed: 0,Outlet_Age,Item_Age,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_MRP,Item_Fat_Content
3,15,2006.0,0,1,0,182.095,3
4,26,1994.0,2,1,2,53.8614,2
5,4,1977.0,1,1,1,51.4008,3
11,16,2010.0,0,0,2,144.1102,3
16,4,1971.0,1,1,1,115.3492,2


In [19]:
lgbm_model1 = lgb.LGBMRegressor(
    n_estimators=100, 
    random_state=42, 
    verbose=-1
)
lgbm_model_clean = lgbm_model1.fit(X_train_clean, y_train_clean)


In [20]:
y_test_pred_clean = lgbm_model_clean.predict(x_test)

In [21]:
submission_best_clean = pd.DataFrame({
    'Item_Identifier': test['Item_Identifier'],
    'Outlet_Identifier': test['Outlet_Identifier'],
    'Item_Outlet_Sales': y_test_pred_clean
})
# Save as CSV
submission_best_clean.to_csv("submission_best_clean.csv", index=False)

print("Submission file created successfully!")

Submission file created successfully!
