In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
%matplotlib inline
warnings.filterwarnings("ignore")
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import statsmodels.api as sm


In [197]:
# importing data.
df = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [198]:
#-------------------- filling missing---------------------------
class CustomImputer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.item_mean = None
        self.overall_mean = None
        self.group_modes = None
        self.type_modes = None

    def fit(self, X, y=None):
        # Step 1: Store mean per Item_Identifier and overall mean.
        self.item_mean = X.groupby("Item_Identifier")["Item_Weight"].mean()
        self.overall_mean = X["Item_Weight"].mean()

        # Step 2: Store mode by (Location, Type) and by only Type
        self.group_modes = (
            X.groupby(["Outlet_Location_Type", "Outlet_Type"])["Outlet_Size"]
            .agg(lambda x: x.mode()[0] if not x.mode().empty else np.nan)
        )
        self.type_modes = (
            X.groupby(["Outlet_Type"])["Outlet_Size"]
            .agg(lambda x: x.mode()[0] if not x.mode().empty else np.nan)
        )
        return self

    def transform(self, X):
        X = X.copy()

        # --- Item_Weight Imputation ---
        X["Item_Weight"] = X.apply(
            lambda row: self.item_mean.get(row["Item_Identifier"], np.nan)
            if pd.isnull(row["Item_Weight"]) else row["Item_Weight"],
            axis=1
        )
        X["Item_Weight"] = X["Item_Weight"].fillna(self.overall_mean)

        # --- Outlet_Size Imputation ---
        def fill_outlet_size(row):
            if pd.isnull(row["Outlet_Size"]):
                val = self.group_modes.get((row["Outlet_Location_Type"], row["Outlet_Type"]), np.nan)
                if pd.isnull(val):
                    val = self.type_modes.get(row["Outlet_Type"], np.nan)
                if pd.isnull(val):
                    val = "Unknown"
                return val
            return row["Outlet_Size"]

        X["Outlet_Size"] = X.apply(fill_outlet_size, axis=1)

        # ====== Handle Item_Visibility ======
        # Replace 0 with NaN
        X["Item_Visibility"].replace(0, np.nan, inplace=True)
        
        # Fill with mean visibility per Item_Identifier
        X["Item_Visibility"] = X.groupby("Item_Identifier")["Item_Visibility"].transform(
            lambda x: x.fillna(x.mean())
        )
        
        # Still NaN? Fill with overall mean
        X["Item_Visibility"].fillna(X["Item_Visibility"].mean(), inplace=True)
        
        # Cap extreme values at 99th percentile
        q99 = X["Item_Visibility"].quantile(0.99)
        X.loc[X["Item_Visibility"] > q99, "Item_Visibility"] = q99

        # # --- Fix inconsistent categories in Item_Fat_Content ---
        X["Item_Fat_Content"] = X["Item_Fat_Content"].replace({"low fat": "LF"})
        
        # --- Final fallback for any leftover missing ---
        for col in X.columns:
            if X[col].dtype == "object":
                X[col] = X[col].fillna("Unknown")
            else:
                X[col] = X[col].fillna(0)

        return X



In [196]:
# Assuming your DataFrame is called df
mean_sales_by_fat = df.groupby("Item_Fat_Content")["Item_Outlet_Sales"].mean()

print(mean_sales_by_fat)

Item_Fat_Content
LF         2073.551928
Low Fat    2164.477336
Regular    2235.186702
low fat    2087.740737
reg        1962.192268
Name: Item_Outlet_Sales, dtype: float64


In [108]:

# Step 1: Compute mean sales per Item_Identifier
mean_sales = df.groupby("Item_Identifier")["Item_Outlet_Sales"].mean().reset_index()




In [109]:
mean_sales

Unnamed: 0,Item_Identifier,Item_Outlet_Sales
0,DRA12,1843.600200
1,DRA24,2246.218971
2,DRA59,2614.430150
3,DRB01,1518.024000
4,DRB13,2428.838400
...,...,...
1554,NCZ30,1807.647000
1555,NCZ41,1827.487840
1556,NCZ42,3839.801760
1557,NCZ53,3014.742400


In [110]:
correlation = df['Item_Visibility'].corr(df['Item_Outlet_Sales'])
print("Correlation:", correlation)


Correlation: -0.1286246122207703


In [200]:
# --------------------------
# STEP 2: Feature Engineering
# --------------------------
import numpy as np

def feature_engineering(df):
    df = df.copy()

    # New Item Type
    df['New_Item_type'] = df['Item_Identifier'].apply(lambda x: x[:2])
    df['New_Item_type'] = df['New_Item_type'].map({'FD':'FOOD','NC':'Non-Consumable','DR':'Drinks'})

    # Outlet Age
    df['Outlet_Age'] = 2013 - df['Outlet_Establishment_Year']

    # Item Age (if identifier has digits)
    df['Item_Age'] = 2013 - df['Item_Identifier'].str.extract(r'(\d+)').astype(float)

    # Visibility Ratio
    df['Visibility_Avg'] = df.groupby('Item_Identifier')['Item_Visibility'].transform('mean')
    df['Visibility_Ratio'] = df['Item_Visibility'] / df['Visibility_Avg']

    # Log transform of Item_MRP
    df['Item_MRP_Log'] = np.log1p(df['Item_MRP'])
    
    # Interaction Features
    df['Item_Weight_x_Item_Visibility'] = df['Item_Weight'] * df['Item_Visibility']
    df['Item_Weight_x_Outlet_Age'] = df['Item_Weight'] * df['Outlet_Age']
    df['MRP_x_Visibility'] = df['Item_MRP'] * df['Item_Visibility']

    # Interactions with categorical variables
    df['Visibility_x_Size'] = df['Item_Visibility'] * df['Outlet_Size'].map({'Small':1, 'Medium':2, 'High':3})
    df['Visibility_x_Tier'] = df['Item_Visibility'] * df['Outlet_Location_Type'].map({'Tier 1':1, 'Tier 2':2, 'Tier 3':3})
    df['Visibility_x_OutletType'] = df['Item_Visibility'].astype(float) * df['Outlet_Type'].astype('category').cat.codes

    return df

In [201]:
## just significat column

from sklearn.base import BaseEstimator, TransformerMixin

class ColumnKeeper(BaseEstimator, TransformerMixin):
    def __init__(self, columns_to_keep=None):
        # Default columns to keep
        if columns_to_keep is None:
            self.columns_to_keep = [
                'Item_Outlet_Sales',
                'Outlet_Age',
                'Item_Age',
                'Outlet_Size',
                'Outlet_Location_Type',
                'Outlet_Type',
                'Item_MRP',
                'Item_Fat_Content'
                
]
        else:
            self.columns_to_keep = columns_to_keep
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.copy()
        # Only keep columns that exist in the dataframe
        existing_cols_to_keep = [col for col in self.columns_to_keep if col in X.columns]
        return X[existing_cols_to_keep]

In [202]:
from sklearn.base import BaseEstimator, TransformerMixin

class CategoricalEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        # Predefined mappings
        self.outlet_size_map = {
            'High': 2,
            'Medium': 1,
            'Small': 0
        }
        self.outlet_type_map = {
            'Grocery Store': 0,
            'Supermarket Type1': 2,
            'Supermarket Type2': 1,
            'Supermarket Type3': 3
        }
        self.outlet_location_map = {
            'Tier 1': 0,
            'Tier 2': 2,
            'Tier 3': 1
        }
        self.item_fat_map = {
            'LF': 1,
            'Low Fat': 2,
            'Regular': 3,
            'reg': 0
        }
        

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()

        # Apply categorical encodings
        if 'Outlet_Size' in X.columns:
            X['Outlet_Size'] = X['Outlet_Size'].map(self.outlet_size_map).fillna(0).astype(int)

        if 'Outlet_Type' in X.columns:
            X['Outlet_Type'] = X['Outlet_Type'].map(self.outlet_type_map).fillna(0).astype(int)

        if 'Outlet_Location_Type' in X.columns:
            X['Outlet_Location_Type'] = X['Outlet_Location_Type'].map(self.outlet_location_map).fillna(0).astype(int)

        if 'Item_Fat_Content' in X.columns:
            X['Item_Fat_Content'] = X['Item_Fat_Content'].map(self.item_fat_map).fillna(0).astype(int)

        
        return X


In [203]:
pipeline = Pipeline([
    ('imputer', CustomImputer()),
    ('feature_eng', FunctionTransformer(feature_engineering)),
    ('ColumnKeeper', ColumnKeeper()),
    ('CategoricalEncoder', CategoricalEncoder())
])

In [204]:
# -----------------------------
# STEP 1: Fit the pipeline on training data
# -----------------------------
pipeline.fit(df)  # only fit on training data

In [205]:
df.columns

Index(['Item_Identifier', 'Item_Weight', 'Item_Fat_Content', 'Item_Visibility',
       'Item_Type', 'Item_MRP', 'Outlet_Identifier',
       'Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type',
       'Outlet_Type', 'Item_Outlet_Sales'],
      dtype='object')

In [206]:
# -----------------------------
# STEP 2: Transform training data
# -----------------------------
df_train_transformed = pipeline.transform(df)

In [207]:
# -----------------------------
# STEP 3: Transform test data
# -----------------------------
x_test = pipeline.transform(test)


In [208]:
test.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,FDW58,20.75,Low Fat,0.007565,Snack Foods,107.8622,OUT049,1999,Medium,Tier 1,Supermarket Type1
1,FDW14,8.3,reg,0.038428,Dairy,87.3198,OUT017,2007,,Tier 2,Supermarket Type1
2,NCN55,14.6,Low Fat,0.099575,Others,241.7538,OUT010,1998,,Tier 3,Grocery Store
3,FDQ58,7.315,Low Fat,0.015388,Snack Foods,155.034,OUT017,2007,,Tier 2,Supermarket Type1
4,FDY38,,Regular,0.118599,Dairy,234.23,OUT027,1985,Medium,Tier 3,Supermarket Type3


In [209]:
df_train_transformed.head()

Unnamed: 0,Item_Outlet_Sales,Outlet_Age,Item_Age,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_MRP,Item_Fat_Content
0,8.225808,14,1998.0,1,0,2,249.8092,2
1,6.096776,4,2012.0,1,1,1,48.2692,3
2,7.648868,14,1998.0,1,0,2,141.618,2
3,6.597664,15,2006.0,0,1,0,182.095,3
4,6.903451,26,1994.0,2,1,2,53.8614,2


In [210]:
x_test.head()

Unnamed: 0,Outlet_Age,Item_Age,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_MRP,Item_Fat_Content
0,14,1955.0,1,0,2,107.8622,2
1,6,1999.0,0,2,2,87.3198,0
2,15,1958.0,0,1,0,241.7538,2
3,6,1955.0,0,2,2,155.034,2
4,28,1975.0,1,1,3,234.23,3


In [211]:
X = df_train_transformed.drop(columns=['Item_Outlet_Sales'])
y = df_train_transformed['Item_Outlet_Sales']

In [212]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [213]:
# Required imports
import time
import numpy as np
import pandas as pd
from math import log
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score

# Example model imports (keep/remove as needed)
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor


class ModelLeaderboard:
    def __init__(self, cv=5):
        self.leaderboard = []
        self.models = {}
        self.cv = cv  # cv folds for r2_pred (cross-validated R2)

    def _safe_p_count(self, model, X):
        """
        Estimate number of parameters p for AIC/BIC:
        - If linear model with coef_ available: use number of non-zero coefficients + intercept.
        - Else fall back to n_features + 1 (intercept).
        This is an approximation for non-linear/ensemble models.
        """
        n_features = X.shape[1]
        try:
            if hasattr(model, "coef_"):
                coef = np.asarray(model.coef_)
                # handle multi-dim coef (e.g., multioutput) conservatively
                if coef.ndim == 1:
                    nonzero = np.count_nonzero(coef)
                else:
                    nonzero = np.count_nonzero(coef.ravel())
                p = int(nonzero) + 1  # +1 for intercept
            else:
                p = n_features + 1
        except Exception:
            p = n_features + 1
        return max(1, p)

    def _compute_aic_bic(self, y_true, y_pred, p):
        """
        Compute AIC and BIC from residual sum of squares (SSE).
        Note: This is an approximate approach for non-linear models.
        """
        y_true = np.asarray(y_true).ravel()
        y_pred = np.asarray(y_pred).ravel()
        n = y_true.shape[0]
        resid = y_true - y_pred
        sse = np.sum(resid ** 2)
        # avoid log(0)
        if sse <= 0:
            sse = 1e-12
        aic = n * np.log(sse / n) + 2 * p
        bic = n * np.log(sse / n) + np.log(n) * p
        return aic, bic

    def add_model_result(self,
                         model_name,
                         model,
                         X_train, y_train,
                         X_test, y_test,
                         train_pred, test_pred,
                         train_time):
        """
        Add full metrics for a model to the leaderboard.
        Inputs:
        - model_name: str
        - model: fitted estimator (used for p estimation & storing)
        - X_train, y_train, X_test, y_test: arrays / dataframes
        - train_pred, test_pred: model predictions (arrays)
        - train_time: float (seconds)
        """
        # Convert to arrays
        y_train_arr = np.asarray(y_train).ravel()
        y_test_arr = np.asarray(y_test).ravel()
        train_pred_arr = np.asarray(train_pred).ravel()
        test_pred_arr = np.asarray(test_pred).ravel()

        # Basic metrics
        train_rmse = float(np.sqrt(mean_squared_error(y_train_arr, train_pred_arr)))
        test_rmse = float(np.sqrt(mean_squared_error(y_test_arr, test_pred_arr)))
        train_r2 = float(r2_score(y_train_arr, train_pred_arr))
        test_r2 = float(r2_score(y_test_arr, test_pred_arr))

        # Cross-validated (predicted) R2 on training set
        try:
            cv_scores = cross_val_score(model, X_train, y_train, scoring='r2', cv=self.cv, n_jobs=-1)
            r2_pred = float(np.mean(cv_scores))
        except Exception:
            r2_pred = np.nan

        # Adjusted R2 (on training set)
        n_train = y_train_arr.shape[0]
        p = self._safe_p_count(model, np.asarray(X_train))
        # Prevent division by zero
        if n_train - p - 1 > 0:
            adj_r2 = 1 - (1 - train_r2) * (n_train - 1) / (n_train - p - 1)
        else:
            adj_r2 = np.nan

        # AIC and BIC (approximate)
        try:
            aic, bic = self._compute_aic_bic(y_train_arr, train_pred_arr, p)
        except Exception:
            aic, bic = np.nan, np.nan

        result = {
            'Model': model_name,
            'Train_RMSE': train_rmse,
            'Test_RMSE': test_rmse,
            'Train_R2': train_r2,
            'Test_R2': test_r2,
            'R2_Pred (CV)': r2_pred,
            'Adj_R2': adj_r2,
            'AIC': aic,
            'BIC': bic,
            'Overfit_Gap': test_rmse - train_rmse,
            'Train_Time': train_time
        }

        self.leaderboard.append(result)
        if model is not None:
            self.models[model_name] = model
        self.display_leaderboard()

    def display_leaderboard(self, sort_by='Test_RMSE'):
        """Display current leaderboard sorted by Test_RMSE (default)."""
        df = pd.DataFrame(self.leaderboard)
        if df.empty:
            print("Leaderboard is empty.")
            return
        if sort_by not in df.columns:
            sort_by = 'Test_RMSE'
        df_sorted = df.sort_values(sort_by).reset_index(drop=True)
        df_sorted.index = df_sorted.index + 1
        print("\n" + "=" * 90)
        print("🏆 MODEL LEADERBOARD (Sorted by {})".format(sort_by))
        print("=" * 90)
        # show with reasonable rounding
        print(df_sorted.round(4))
        print("=" * 90)

    def train_and_evaluate_models(self, X_train, X_test, y_train, y_test):
        """Train a set of common regressors and add results to leaderboard."""
        models_to_run = {
            'Linear Regression': LinearRegression(),
            'Ridge Regression': Ridge(alpha=1.0, random_state=42),
            'Lasso Regression': Lasso(alpha=0.1, random_state=42),
            'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
            'XGBoost': xgb.XGBRegressor(n_estimators=100, random_state=42, eval_metric='rmse'),
            'LightGBM': lgb.LGBMRegressor(n_estimators=100, random_state=42, verbose=-1),
            'CatBoost': CatBoostRegressor(iterations=100, random_state=42, verbose=False)
        }

        for name, model in models_to_run.items():
            print(f"🔄 Training {name}...")
            start_time = time.time()
            model.fit(X_train, y_train)
            train_time = time.time() - start_time

            train_pred = model.predict(X_train)
            test_pred = model.predict(X_test)

            self.add_model_result(
                model_name=name,
                model=model,
                X_train=X_train, y_train=y_train,
                X_test=X_test, y_test=y_test,
                train_pred=train_pred, test_pred=test_pred,
                train_time=train_time
            )

        print("\n✅ All models trained successfully!")
        return self.get_best_model()

    def get_best_model(self, sort_by='Test_RMSE'):
        """Return the best model according to sort_by metric (default Test_RMSE)."""
        if not self.leaderboard:
            return None
        best_result = min(self.leaderboard, key=lambda x: x.get(sort_by, np.inf))
        best_model_name = best_result['Model']
        print(f"\n🥇 BEST MODEL: {best_model_name}")
        print(f"{sort_by}: {best_result.get(sort_by):.4f}")
        return self.models.get(best_model_name)



In [214]:

# # -------- USAGE EXAMPLE ----------
# leaderboard = ModelLeaderboard(cv=5)
# best_model = leaderboard.train_and_evaluate_models(X_train, X_test, y_train, y_test)
# results_df = pd.DataFrame(leaderboard.leaderboard)


In [215]:
# Initialize the LGBMRegressor with your specified parameters
lgbm_model = lgb.LGBMRegressor(
    n_estimators=100, 
    random_state=42, 
    verbose=-1
)

In [216]:
# Fit the model on your training data
# Replace X_train and y_train with your actual feature matrix and target variable
print("Training the LGBMRegressor model...")
lgbm_model.fit(X,y)

Training the LGBMRegressor model...


In [217]:
# Make predictions on the test set
print("Making predictions on test data...")
y_test_pred = lgbm_model.predict(x_test)

Making predictions on test data...


In [218]:
y_test_pred.shape

(5681,)

In [219]:
submission_best = pd.DataFrame({
    'Item_Identifier': test['Item_Identifier'],
    'Outlet_Identifier': test['Outlet_Identifier'],
    'Item_Outlet_Sales': y_test_pred
})
# Save as CSV
submission_best.to_csv("submission_best.csv", index=False)

print("Submission file created successfully!")

Submission file created successfully!


In [220]:

y_pred = lgbm_model.predict(X)
residuals = y - y_pred


In [221]:
std_resid = residuals / np.std(residuals)
influential_idx = np.where(np.abs(std_resid) > 0.2)[0]  # threshold can be 2.5 or 3
influential_idx

array([   0,    1,    3, ..., 8519, 8521, 8522])

In [222]:
X_train_clean = X.drop(index=influential_idx)
y_train_clean = y.drop(index=influential_idx)
X_train_clean.head()

Unnamed: 0,Outlet_Age,Item_Age,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_MRP,Item_Fat_Content
2,14,1998.0,1,0,2,141.618,2
5,4,1977.0,1,1,1,51.4008,3
11,16,2010.0,0,0,2,144.1102,3
17,14,1964.0,1,0,2,54.3614,3
32,4,1980.0,1,1,1,256.6672,2


In [223]:
lgbm_model1 = lgb.LGBMRegressor(
    n_estimators=100, 
    random_state=42, 
    verbose=-1
)
lgbm_model_clean = lgbm_model1.fit(X_train_clean, y_train_clean)


In [224]:
y_test_pred_clean = lgbm_model_clean.predict(x_test)

In [225]:
y_test_pred_clean.shape

(5681,)

In [226]:
submission_best_clean = pd.DataFrame({
    'Item_Identifier': test['Item_Identifier'],
    'Outlet_Identifier': test['Outlet_Identifier'],
    'Item_Outlet_Sales': y_test_pred_clean
})
# Save as CSV
submission_best_clean.to_csv("submission_best_clean.csv", index=False)

print("Submission file created successfully!")

Submission file created successfully!


In [190]:
X_train, X_test, y_train, y_test = train_test_split(X_train_clean,y_train_clean, test_size=0.2, random_state=42)

In [191]:
# -------- USAGE EXAMPLE ----------
leaderboard = ModelLeaderboard(cv=10)
best_model = leaderboard.train_and_evaluate_models(X_train, X_test, y_train, y_test)
results_df = pd.DataFrame(leaderboard.leaderboard)

🔄 Training Linear Regression...

🏆 MODEL LEADERBOARD (Sorted by Test_RMSE)
               Model  Train_RMSE  Test_RMSE  Train_R2  Test_R2  R2_Pred (CV)  \
1  Linear Regression    511.0212   562.2685    0.8201   0.7951        0.8166   

   Adj_R2         AIC         BIC  Overfit_Gap  Train_Time  
1  0.8193  22367.2975  22411.2262      51.2472      0.0161  
🔄 Training Ridge Regression...

🏆 MODEL LEADERBOARD (Sorted by Test_RMSE)
               Model  Train_RMSE  Test_RMSE  Train_R2  Test_R2  R2_Pred (CV)  \
1  Linear Regression    511.0212   562.2685    0.8201   0.7951        0.8166   
2   Ridge Regression    511.0215   562.2748    0.8201   0.7951        0.8166   

   Adj_R2         AIC         BIC  Overfit_Gap  Train_Time  
1  0.8193  22367.2975  22411.2262      51.2472      0.0161  
2  0.8193  22367.2991  22411.2278      51.2533      0.0047  
🔄 Training Lasso Regression...

🏆 MODEL LEADERBOARD (Sorted by Test_RMSE)
               Model  Train_RMSE  Test_RMSE  Train_R2  Test_R2  R2_Pre

In [192]:
rf =  RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
xg =  xgb.XGBRegressor(n_estimators=100, random_state=42, eval_metric='rmse')

In [193]:
rf1 = rf.fit(X_train_clean, y_train_clean)
xg1 = xg.fit(X_train_clean, y_train_clean)


In [194]:
rf11 = rf1.predict(x_test)
xg11 = xg1.predict(x_test)

In [195]:
sub_rf = pd.DataFrame({
    'Item_Identifier': test['Item_Identifier'],
    'Outlet_Identifier': test['Outlet_Identifier'],
    'Item_Outlet_Sales': rf11
})
# Save as CSV
sub_rf.to_csv("sub_rf.csv", index=False)

print("Submission file created successfully!")



Submission file created successfully!


In [119]:
sub_xg = pd.DataFrame({
    'Item_Identifier': test['Item_Identifier'],
    'Outlet_Identifier': test['Outlet_Identifier'],
    'Item_Outlet_Sales': xg11
})
# Save as CSV
sub_xg.to_csv("sub_xg.csv", index=False)

print("Submission file created successfully!")

Submission file created successfully!


# best model.
 Model  Train_RMSE  Test_RMSE  Train_R2  Test_R2  R2_Pred (CV)  Adj_R2         AIC         BIC  Overfit_Gap  Train_Time
LightGBM    129.6930   186.0050    0.9885   0.9717   0.9719      0.9884  17538.3425  17576.8152      56.3120      0.2433  
     
 

In [124]:
lgbm_model1 = lgb.LGBMRegressor(
    n_estimators=100, 
    random_state=42, 
    verbose=-1
)
lgbm_model_clean = lgbm_model1.fit(X_train_clean, y_train_clean)


In [125]:
 from sklearn.model_selection import GridSearchCV
import lightgbm as lgb

In [141]:
lgbm_model = lgb.LGBMRegressor(
    objective='gamma',
    random_state=42,
    verbose=-1
)

In [142]:
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [3, 5, 7, -1],   # -1 means no limit
    'num_leaves': [15, 31, 63],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

In [143]:
grid_search = GridSearchCV(
    estimator=lgbm_model,
    param_grid=param_grid,
    cv=5,                # 5-fold cross-validation
    scoring='neg_mean_squared_error',  # you can also use 'r2' or 'neg_mean_absolute_error'
    n_jobs=-1,           # use all CPU cores
    verbose=2
)

grid_search.fit(X_train_clean, y_train_clean)


Fitting 5 folds for each of 432 candidates, totalling 2160 fits


# LGBMRegressor(max_depth=5, n_estimators=200, objective='gamma', random_state=42,subsample=0.8, verbose=-1)
Best Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200, 'num_leaves': 31, 'subsample': 0.8}
Best Score (CV RMSE): 194.34483780684928

In [147]:
print("Best Parameters:", grid_search.best_params_)
print("Best Score (CV RMSE):", (-grid_search.best_score_)**0.5)

best_model = grid_search.best_estimator_
best_model

Best Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200, 'num_leaves': 31, 'subsample': 0.8}
Best Score (CV RMSE): 194.34483780684928


In [148]:
from sklearn.metrics import mean_squared_error, r2_score

y_test_pred_best = best_model.predict(x_test)

y_test_pred_best

array([1756.77538644, 1300.14541573,  529.32694423, ..., 2110.09310445,
       3452.87534195, 1348.13617197])

In [149]:
y_test_pred_best_ = pd.DataFrame({
    'Item_Identifier': test['Item_Identifier'],
    'Outlet_Identifier': test['Outlet_Identifier'],
    'Item_Outlet_Sales': y_test_pred_best
})
# Save as CSV
y_test_pred_best_.to_csv("y_test_pred_best.csv", index=False)

print("Submission file created successfully!")

Submission file created successfully!


# Best Model Results

| Model    | Train_RMSE | Test_RMSE | Train_R² | Test_R² | R²_Pred (CV) | Adj_R² |     AIC     |     BIC     | Overfit_Gap | Train_Time |
|----------|------------|-----------|----------|---------|--------------|--------|-------------|-------------|-------------|------------|
| LightGBM | 129.6930   | 186.0050  | 0.9885   | 0.9717  | 0.9719       | 0.9884 | 17538.3425  | 17576.8152  | 56.3120     | 0.2433     |


In [151]:
lgbm_model1 = lgb.LGBMRegressor(
    n_estimators=100, 
    random_state=42, 
    verbose=-1
)
lgbm_model_clean = lgbm_model1.fit(X_train_clean, y_train_clean)


In [21]:
submission_best_clean = pd.DataFrame({
    'Item_Identifier': test['Item_Identifier'],
    'Outlet_Identifier': test['Outlet_Identifier'],
    'Item_Outlet_Sales': y_test_pred_clean
})
# Save as CSV
submission_best_clean.to_csv("submission_best_clean.csv", index=False)

print("Submission file created successfully!")

NameError: name 'y_test_pred_clean' is not defined

In [155]:
import pickle

# Save to file
with open("lgbm_model1.pkl", "wb") as f:
    pickle.dump(lgbm_model1, f)



# # Load from file
# with open("lgbm_model1.pkl", "rb") as f:
#     loaded_model = pickle.load(f)

# # Test prediction
# y_pred_loaded = loaded_model.predict(x_test)
