In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
%matplotlib inline
warnings.filterwarnings("ignore")
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import statsmodels.api as sm


In [24]:
# importing data.
df = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [25]:
class CustomImputer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.item_mean = None
        self.overall_mean = None
        self.group_modes = None
        self.type_modes = None

    def fit(self, X, y=None):
        # Step 1: Store mean per Item_Identifier and overall mean.
        self.item_mean = X.groupby("Item_Identifier")["Item_Weight"].mean()
        self.overall_mean = X["Item_Weight"].mean()

        # Step 2: Store mode by (Location, Type) and by only Type
        self.group_modes = (
            X.groupby(["Outlet_Location_Type", "Outlet_Type"])["Outlet_Size"]
            .agg(lambda x: x.mode()[0] if not x.mode().empty else np.nan)
        )
        self.type_modes = (
            X.groupby(["Outlet_Type"])["Outlet_Size"]
            .agg(lambda x: x.mode()[0] if not x.mode().empty else np.nan)
        )
        return self

    def transform(self, X):
        X = X.copy()

        # --- Item_Weight Imputation ---
        X["Item_Weight"] = X.apply(
            lambda row: self.item_mean.get(row["Item_Identifier"], np.nan)
            if pd.isnull(row["Item_Weight"]) else row["Item_Weight"],
            axis=1
        )
        X["Item_Weight"] = X["Item_Weight"].fillna(self.overall_mean)

        # --- Outlet_Size Imputation ---
        def fill_outlet_size(row):
            if pd.isnull(row["Outlet_Size"]):
                val = self.group_modes.get((row["Outlet_Location_Type"], row["Outlet_Type"]), np.nan)
                if pd.isnull(val):
                    val = self.type_modes.get(row["Outlet_Type"], np.nan)
                if pd.isnull(val):
                    val = "Unknown"
                return val
            return row["Outlet_Size"]

        X["Outlet_Size"] = X.apply(fill_outlet_size, axis=1)

        # ====== Handle Item_Visibility ======
        # Replace 0 with NaN
        X["Item_Visibility"].replace(0, np.nan, inplace=True)
        
        # Fill with mean visibility per Item_Identifier
        X["Item_Visibility"] = X.groupby("Item_Identifier")["Item_Visibility"].transform(
            lambda x: x.fillna(x.mean())
        )
        
        # Still NaN? Fill with overall mean
        X["Item_Visibility"].fillna(X["Item_Visibility"].mean(), inplace=True)
        
        # Cap extreme values at 99th percentile
        q99 = X["Item_Visibility"].quantile(0.99)
        X.loc[X["Item_Visibility"] > q99, "Item_Visibility"] = q99

        # # --- Fix inconsistent categories in Item_Fat_Content ---
        X["Item_Fat_Content"] = X["Item_Fat_Content"].replace({"low fat": "Low Fat","LF": "Low Fat","reg": "Regular"})
        
        # --- Final fallback for any leftover missing ---
        for col in X.columns:
            if X[col].dtype == "object":
                X[col] = X[col].fillna("Unknown")
            else:
                X[col] = X[col].fillna(0)

        return X



In [26]:
# --------------------------
# STEP 2: Feature Engineering
# --------------------------
import numpy as np

def feature_engineering(df):
    df = df.copy()

    # New Item Type
    df['New_Item_type'] = df['Item_Identifier'].apply(lambda x: x[:2])
    df['New_Item_type'] = df['New_Item_type'].map({'FD':'FOOD','NC':'Non-Consumable','DR':'Drinks'})

    # Outlet Age
    df['Outlet_Age'] = 2013 - df['Outlet_Establishment_Year']

    # Item Age (if identifier has digits)
    df['Item_Age'] = 2013 - df['Item_Identifier'].str.extract(r'(\d+)').astype(float)

    # Visibility Ratio
    df['Visibility_Avg'] = df.groupby('Item_Identifier')['Item_Visibility'].transform('mean')
    df['Visibility_Ratio'] = df['Item_Visibility'] / df['Visibility_Avg']

    # Log transform of Item_MRP
    df['Item_MRP_Log'] = np.log1p(df['Item_MRP'])

    # Interaction Features
    df['Item_Weight_x_Item_Visibility'] = df['Item_Weight'] * df['Item_Visibility']
    df['Item_Weight_x_Outlet_Age'] = df['Item_Weight'] * df['Outlet_Age']
    df['MRP_x_Visibility'] = df['Item_MRP'] * df['Item_Visibility']

    # Interactions with categorical variables
    df['Visibility_x_Size'] = df['Item_Visibility'] * df['Outlet_Size'].map({'Small':1, 'Medium':2, 'High':3})
    df['Visibility_x_Tier'] = df['Item_Visibility'] * df['Outlet_Location_Type'].map({'Tier 1':1, 'Tier 2':2, 'Tier 3':3})
    df['Visibility_x_OutletType'] = df['Item_Visibility'].astype(float) * df['Outlet_Type'].astype('category').cat.codes

    return df




In [27]:
## just significat column

from sklearn.base import BaseEstimator, TransformerMixin

class ColumnKeeper(BaseEstimator, TransformerMixin):
    def __init__(self, columns_to_keep=None):
        # Default columns to keep
        if columns_to_keep is None:
            self.columns_to_keep = [
                'Item_Outlet_Sales',
                'Outlet_Age',
                'Item_Age',
                'Outlet_Size',
                'Outlet_Location_Type',
                'Outlet_Type',
                'Item_MRP'
            ]
        else:
            self.columns_to_keep = columns_to_keep
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.copy()
        # Only keep columns that exist in the dataframe
        existing_cols_to_keep = [col for col in self.columns_to_keep if col in X.columns]
        return X[existing_cols_to_keep]

In [28]:
pipeline = Pipeline([
    ('imputer', CustomImputer()),
    ('feature_eng', FunctionTransformer(feature_engineering)),
    ('ColumnKeeper', ColumnKeeper())
])

In [29]:
# Fit and transform
df_pro = pipeline.fit_transform(df)

In [9]:
df_pro.groupby('Outlet_Size')['Item_Outlet_Sales'].mean().reset_index(name='Avg_Sales')


Unnamed: 0,Outlet_Size,Avg_Sales
0,High,2298.995256
1,Medium,2681.603542
2,Small,1867.182814


In [10]:
df_pro.head()

Unnamed: 0,Item_Outlet_Sales,Outlet_Age,Item_Age,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_MRP
0,3735.138,14,1998.0,Medium,Tier 1,Supermarket Type1,249.8092
1,443.4228,4,2012.0,Medium,Tier 3,Supermarket Type2,48.2692
2,2097.27,14,1998.0,Medium,Tier 1,Supermarket Type1,141.618
3,732.38,15,2006.0,Small,Tier 3,Grocery Store,182.095
4,994.7052,26,1994.0,High,Tier 3,Supermarket Type1,53.8614


In [11]:
df_pro.groupby('Outlet_Type')['Item_Outlet_Sales'].mean().reset_index(name='Avg_Sales')


Unnamed: 0,Outlet_Type,Avg_Sales
0,Grocery Store,339.8285
1,Supermarket Type1,2316.181148
2,Supermarket Type2,1995.498739
3,Supermarket Type3,3694.038558


In [12]:
df_pro.groupby('Outlet_Location_Type')['Item_Outlet_Sales'].mean().reset_index(name='Avg_Sales')


Unnamed: 0,Outlet_Location_Type,Avg_Sales
0,Tier 1,1876.909159
1,Tier 2,2323.990559
2,Tier 3,2279.627651


In [13]:


class CategoricalEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        # Predefined mappings based on your analysis
        self.outlet_size_map = {
            'High': 2,
            'Medium': 1,
            'Small': 0
        }
        self.outlet_type_map = {
            'Grocery Store': 0,
            'Supermarket Type1': 2,
            'Supermarket Type2': 1,
            'Supermarket Type3': 3
        }
        self.outlet_location_map = {
            'Tier 1': 0,
            'Tier 2': 2,
            'Tier 3': 1
        }
        self.categorical_columns = ['Outlet_Size', 'Outlet_Type', 'Outlet_Location_Type']

    def fit(self, X, y=None):
        # Nothing to learn, just return self
        return self

    def transform(self, X):
        X = X.copy()
        # Apply the pre-defined mappings
        if 'Outlet_Size' in X.columns:
            X['Outlet_Size'] = X['Outlet_Size'].map(self.outlet_size_map).fillna(0).astype(int)
        if 'Outlet_Type' in X.columns:
            X['Outlet_Type'] = X['Outlet_Type'].map(self.outlet_type_map).fillna(0).astype(int)
        if 'Outlet_Location_Type' in X.columns:
            X['Outlet_Location_Type'] = X['Outlet_Location_Type'].map(self.outlet_location_map).fillna(0).astype(int)
        return X


In [14]:
pipeline = Pipeline([
    ('imputer', CustomImputer()),
    ('feature_eng', FunctionTransformer(feature_engineering)),
    ('ColumnKeeper', ColumnKeeper()),
    ('CategoricalEncoder', CategoricalEncoder())
])

In [15]:
# -----------------------------
# STEP 1: Fit the pipeline on training data
# -----------------------------
pipeline.fit(df)  # only fit on training data

In [16]:
# -----------------------------
# STEP 2: Transform training data
# -----------------------------
df_train_transformed = pipeline.transform(df)

In [17]:
# -----------------------------
# STEP 3: Transform test data
# -----------------------------
x_test = pipeline.transform(test)

In [18]:
x_test.head()

Unnamed: 0,Outlet_Age,Item_Age,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_MRP
0,14,1955.0,1,0,2,107.8622
1,6,1999.0,0,2,2,87.3198
2,15,1958.0,0,1,0,241.7538
3,6,1955.0,0,2,2,155.034
4,28,1975.0,1,1,3,234.23


In [19]:
df_test_transformed

NameError: name 'df_test_transformed' is not defined

# now data is ready for the ML.

In [None]:
X = df_train_transformed.drop(columns=['Item_Outlet_Sales'])
y = df_train_transformed['Item_Outlet_Sales']

In [None]:
# Fit GLM (Gamma, log link)
glm_model = sm.GLM(y, X, family=sm.families.Gamma(sm.families.links.log()))
glm_results = glm_model.fit()



In [None]:
from sklearn.metrics import r2_score

In [None]:
# Deviance
deviance_val = glm_results.deviance

# AIC
aic_val = glm_results.aic

# McFadden R²
# Fit null (intercept-only) model
X_null = sm.add_constant(pd.DataFrame(index=df.index))  # only intercept
glm_null = sm.GLM(y, X_null, family=sm.families.Gamma(sm.families.links.log())).fit()

r2_mcfadden = 1 - (glm_results.deviance / glm_null.deviance)

# Adjusted McFadden R²
k = glm_results.df_model  # number of predictors
r2_mcfadden_adj = 1 - ((glm_results.deviance + 2*k) / glm_null.deviance)

# Predicted R² (squared correlation between predicted & actual)
y_pred = glm_results.fittedvalues
r2_pred = np.corrcoef(y, y_pred)[0,1]**2

# Display results
metrics = pd.DataFrame({
    'Deviance': [deviance_val],
    'AIC': [aic_val],
    'R2_McFadden': [r2_mcfadden],
    'R2_McFadden_Adj': [r2_mcfadden_adj],
    'R2_Pred': [r2_pred]
})

print(metrics.round(4))

In [None]:
# Compute influence measures
influence = glm_results.get_influence()
cooks_d = influence.cooks_distance[0]



In [None]:
# Threshold for influential points
n = len(df)
threshold = 4/n
influential_idx = np.where(cooks_d > threshold)[0]

print("Cook's distance for each observation:", cooks_d.shape)
print("Influential observation indices (Cook's D > 4/n):", influential_idx.shape)

In [None]:
# Remove influential observations from original dataframe
df_clean = df_train_transformed.drop(index=influential_idx).reset_index(drop=True)
print(df_train_transformed.shape)
print(df_clean.shape)
no_column_dorp = (df_train_transformed.shape[0])-(df_clean.shape[0])
(no_column_dorp)

In [None]:
X_clean = df_clean.drop(columns=['Item_Outlet_Sales'])
y_clean = df_clean['Item_Outlet_Sales']

In [20]:
glm_model_clean = sm.GLM(y_clean, X_clean, family=sm.families.Gamma(sm.families.links.log()))
glm_results_clean = glm_model_clean.fit()


NameError: name 'y_clean' is not defined

In [21]:

# Deviance
deviance_val_clean = glm_results_clean.deviance

# AIC
aic_val_clean = glm_results_clean.aic

# McFadden R²
X_null_clean = sm.add_constant(pd.DataFrame(index=df_clean.index))  # intercept only
glm_null_clean = sm.GLM(y_clean, X_null_clean, family=sm.families.Gamma(sm.families.links.log())).fit()

r2_mcfadden_clean = 1 - (glm_results_clean.deviance / glm_null_clean.deviance)

# Adjusted McFadden R²
k_clean = glm_results_clean.df_model  # number of predictors
r2_mcfadden_adj_clean = 1 - ((glm_results_clean.deviance + 2*k_clean) / glm_null_clean.deviance)

# Predicted R²
y_pred_clean = glm_results_clean.fittedvalues
r2_pred_clean = np.corrcoef(y_clean, y_pred_clean)[0,1]**2

# Display results
metrics_clean = pd.DataFrame({
    'Deviance': [deviance_val_clean],
    'AIC': [aic_val_clean],
    'R2_McFadden': [r2_mcfadden_clean],
    'R2_McFadden_Adj': [r2_mcfadden_adj_clean],
    'R2_Pred': [r2_pred_clean]
})

print(metrics_clean.round(4))

NameError: name 'glm_results_clean' is not defined

In [22]:
# Metrics for original model
metrics_orig = pd.DataFrame({
    'Deviance': [glm_results.deviance],
    'AIC': [glm_results.aic],
    'R2_McFadden': [1 - glm_results.deviance / glm_null.deviance],
    'R2_McFadden_Adj': [1 - (glm_results.deviance + 2*glm_results.df_model) / glm_null.deviance],
    'R2_Pred': [np.corrcoef(y, glm_results.fittedvalues)[0,1]**2]
}, index=['Original'])

# Metrics for cleaned model
metrics_clean_df = pd.DataFrame({
    'Deviance': [glm_results_clean.deviance],
    'AIC': [glm_results_clean.aic],
    'R2_McFadden': [1 - glm_results_clean.deviance / glm_null_clean.deviance],
    'R2_McFadden_Adj': [1 - (glm_results_clean.deviance + 2*glm_results_clean.df_model) / glm_null_clean.deviance],
    'R2_Pred': [np.corrcoef(y_clean, glm_results_clean.fittedvalues)[0,1]**2]
}, index=['Cleaned'])

# Combine for comparison
comparison_df = pd.concat([metrics_orig, metrics_clean_df])
print(comparison_df.round(4))


NameError: name 'glm_results' is not defined

In [23]:
from sklearn.metrics import mean_squared_error
# Predicted values for cleaned model
y_pred_clean = glm_results_clean.fittedvalues

# Train RMSE
train_rmse_clean = np.sqrt(mean_squared_error(y_clean, y_pred_clean))
print("Train RMSE (Cleaned Model):", round(train_rmse_clean, 4))

NameError: name 'glm_results_clean' is not defined

In [48]:
df_test_transformed

Unnamed: 0,Outlet_Age,Item_Age,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_MRP
0,14,1955.0,1,0,2,107.8622
1,6,1999.0,0,2,2,87.3198
2,15,1958.0,0,1,0,241.7538
3,6,1955.0,0,2,2,155.0340
4,28,1975.0,1,1,3,234.2300
...,...,...,...,...,...,...
5676,16,1955.0,0,0,2,141.3154
5677,4,1966.0,1,1,1,169.1448
5678,11,1996.0,0,2,2,118.7440
5679,6,1987.0,0,2,2,214.6218


In [58]:
# Assuming 'pred' contains predictions from glm_model_clean
submission = pd.DataFrame({
    'Item_Identifier': test['Item_Identifier'],  # replace with actual test ID column
    'Outlet_Identifier': test['Outlet_Identifier'],  # test outlet IDs
    'Item_Outlet_Sales': y_test_pred  # predictions from the model
})

# Save as CSV
submission.to_csv("submission.csv", index=False)

print("Submission file created successfully!")


Submission file created successfully!


In [59]:
y_test_pred

0       1629.722304
1       1453.111875
2        692.514461
3       2279.357435
4       6818.387055
           ...     
5676    1604.753704
5677    1608.290681
5678    1588.326540
5679    4067.829543
5680    1090.310591
Length: 5681, dtype: float64

NameError: name 'lgbm_model' is not defined