In [5]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import optuna
from optuna.samplers import TPESampler
import warnings
warnings.filterwarnings('ignore')

# Load the datasets
train_df = pd.read_csv('train_v9rqX0R.csv')  # Replace with actual train file path
train_df = train_df[~train_df['Item_Identifier'].isin([
    'FDX20', 'FDG33', 'FDW13', 'FDG24', 'DRE49', 'NCY18',
    'FDO19', 'FDL34', 'FDO52', 'NCL31', 'FDA04', 'NCQ06',
    'FDT07', 'FDL10', 'FDX04', 'FDU19'])]
test_df = pd.read_csv('test_AbJTz2l.csv')    # Replace with actual test file path

# Combine train and test for preprocessing
train_df['source'] = 'train'
test_df['source'] = 'test'
combined_df = pd.concat([train_df, test_df], ignore_index=True)

# Data Preprocessing
combined_df['Item_Weight'] = combined_df.groupby('Item_Type')['Item_Weight'].transform(lambda x: x.fillna(x.mean()))
combined_df['Outlet_Size'] = combined_df.groupby('Outlet_Type')['Outlet_Size'].transform(
    lambda x: x.fillna(x.mode()[0] if not x.mode().empty else 'Medium'))
item_visibility_mean = combined_df[combined_df['Item_Visibility'] > 0].groupby('Item_Identifier')['Item_Visibility'].mean()
combined_df['Item_Visibility'] = combined_df.apply(
    lambda row: item_visibility_mean[row['Item_Identifier']] if row['Item_Visibility'] == 0 else row['Item_Visibility'],
    axis=1
)
combined_df['Item_Visibility'].fillna(combined_df['Item_Visibility'].mean(), inplace=True)

# Feature Engineering
combined_df['Outlet_Age'] = 2013 - combined_df['Outlet_Establishment_Year']
combined_df['Item_Fat_Content'] = combined_df['Item_Fat_Content'].replace({'LF': 'Low Fat', 'reg': 'Regular', 'low fat': 'Low Fat'})

# Encoding categorical variables
cat_cols = ['Item_Fat_Content', 'Item_Type', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']
for col in cat_cols:
    combined_df = pd.concat([combined_df, pd.get_dummies(combined_df[col], prefix=col)], axis=1)
    combined_df.drop(col, axis=1, inplace=True)

# Drop unnecessary columns
combined_df.drop(['Item_Identifier', 'Outlet_Identifier', 'Outlet_Establishment_Year'], axis=1, inplace=True)

# Split back into train and test
train = combined_df[combined_df['source'] == 'train'].drop('source', axis=1)
test = combined_df[combined_df['source'] == 'test'].drop(['source', 'Item_Outlet_Sales'], axis=1)

# Prepare features and target
X = train.drop('Item_Outlet_Sales', axis=1)
y = train['Item_Outlet_Sales']
X_test = test

# Scale numerical columns with RobustScaler
num_cols = ['Item_Weight', 'Item_Visibility', 'Item_MRP', 'Outlet_Age']  # Only scale numerical columns
scaler = RobustScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

# Define the base model for RFE
base_model = ExtraTreesRegressor(n_estimators=472, max_depth=8, min_samples_leaf=40, n_jobs=-1, random_state=42)

# Set number of features to keep (half of total features)
n_features = X.shape[1] // 2

# Set up RFE
selector = RFE(estimator=base_model, n_features_to_select=n_features, step=1)

# Fit and transform your data
X_selected = selector.fit_transform(X, y)
X_test_selected = selector.transform(X_test)

# Get selected feature names
selected_features = X.columns[selector.get_support()].tolist()
print(f"Number of features selected: {len(selected_features)}")
print(f"Selected features: {selected_features}")

# Split train data for validation
X_train, X_val, y_train, y_val = train_test_split(X_selected, y, test_size=0.2, random_state=42)


# Define objective function for Optuna
def objective(trial):
    model_name = trial.suggest_categorical("model", [
        "RandomForest", "XGBoost", "LightGBM", "CatBoost", "ExtraTrees", "GradientBoosting"
    ])
    
    if model_name == "RandomForest":
        params = {
            'n_estimators': trial.suggest_int('rf_n_estimators', 100, 1000),
            'max_depth': trial.suggest_int('rf_max_depth', 3, 15),
            'min_samples_leaf': trial.suggest_int('rf_min_samples_leaf', 10, 100),
            'n_jobs': -1,
            'random_state': 42
        }
        model = RandomForestRegressor(**params)
    
    elif model_name == "XGBoost":
        params = {
            'n_estimators': trial.suggest_int('xgb_n_estimators', 100, 1000),
            'max_depth': trial.suggest_int('xgb_max_depth', 3, 15),
            'learning_rate': trial.suggest_float('xgb_learning_rate', 0.01, 0.3),
            'reg_alpha': trial.suggest_float('xgb_reg_alpha', 0, 1),
            'reg_lambda': trial.suggest_float('xgb_reg_lambda', 0, 1),
            'random_state': 42
        }
        model = XGBRegressor(**params)
    
    elif model_name == "LightGBM":
        params = {
            'n_estimators': trial.suggest_int('lgb_n_estimators', 100, 1000),
            'max_depth': trial.suggest_int('lgb_max_depth', 3, 15),
            'learning_rate': trial.suggest_float('lgb_learning_rate', 0.01, 0.3),
            'reg_alpha': trial.suggest_float('lgb_reg_alpha', 0, 1),
            'reg_lambda': trial.suggest_float('lgb_reg_lambda', 0, 1),
            'random_state': 42,
            'verbose': -1
        }
        model = LGBMRegressor(**params)
    
    elif model_name == "CatBoost":
        params = {
            'iterations': trial.suggest_int('cat_iterations', 100, 1000),
            'depth': trial.suggest_int('cat_depth', 3, 10),
            'learning_rate': trial.suggest_float('cat_learning_rate', 0.01, 0.3),
            'l2_leaf_reg': trial.suggest_float('cat_l2_leaf_reg', 1, 10),
            'random_seed': 42,
            'verbose': 0
        }
        model = CatBoostRegressor(**params)
    
    elif model_name == "ExtraTrees":
        params = {
            'n_estimators': trial.suggest_int('et_n_estimators', 100, 1000),
            'max_depth': trial.suggest_int('et_max_depth', 3, 15),
            'min_samples_leaf': trial.suggest_int('et_min_samples_leaf', 10, 100),
            'n_jobs': -1,
            'random_state': 42
        }
        model = ExtraTreesRegressor(**params)
    
    elif model_name == "GradientBoosting":
        params = {
            'n_estimators': trial.suggest_int('gb_n_estimators', 100, 1000),
            'max_depth': trial.suggest_int('gb_max_depth', 3, 15),
            'learning_rate': trial.suggest_float('gb_learning_rate', 0.01, 0.3),
            'min_samples_leaf': trial.suggest_int('gb_min_samples_leaf', 10, 100),
            'random_state': 42
        }
        model = GradientBoostingRegressor(**params)

    # Train and evaluate model with cross-validation
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    rmse_scores = []
    for train_idx, val_idx in kf.split(X):
        X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]
        model.fit(X_tr, y_tr)
        y_pred = model.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        rmse_scores.append(rmse)
    
    trial.set_user_attr('params', {k: v for k, v in trial.params.items() if k != 'model'})
    return np.mean(rmse_scores)

# Optimize with Optuna using TPESampler
sampler = TPESampler(seed=42)
study = optuna.create_study(direction='minimize', sampler=sampler)
study.optimize(objective, n_trials=50)  # Matches your run

# Get top 3 models from trials
trials = sorted(study.trials, key=lambda t: t.value if t.value is not None else float('inf'))
top_3_trials = trials[:3]
print("Top 3 models and their RMSE scores:")
for trial in top_3_trials:
    print(f"Model: {trial.params['model']}, RMSE: {trial.value}, Params: {trial.user_attrs['params']}")

# Initialize top 3 models with best parameters
models = []
for trial in top_3_trials:
    model_name = trial.params['model']
    # Remove model-specific prefix from parameter names
    params = {k.split('_', 1)[1] if '_' in k else k: v for k, v in trial.params.items() if k != 'model'}
    
    if model_name == "RandomForest":
        model = RandomForestRegressor(**params, n_jobs=-1, random_state=42)
    elif model_name == "XGBoost":
        model = XGBRegressor(**params, random_state=42)
    elif model_name == "LightGBM":
        model = LGBMRegressor(**params, random_state=42, verbose=-1)
    elif model_name == "CatBoost":
        # Rename 'iterations' to match CatBoost API
        if 'iterations' in params:
            params['iterations'] = params.pop('iterations')
        model = CatBoostRegressor(**params, random_seed=42, verbose=0)
    elif model_name == "ExtraTrees":
        model = ExtraTreesRegressor(**params, n_jobs=-1, random_state=42)
    elif model_name == "GradientBoosting":
        model = GradientBoostingRegressor(**params, random_state=42)
    
    models.append((model_name, model))

# Stacking with top 3 models
kf = KFold(n_splits=5, shuffle=True, random_state=42)
oof_preds = {f"{name}_{i}": np.zeros(len(X_selected)) for i, (name, _) in enumerate(models)}
test_preds = {f"{name}_{i}": np.zeros(len(X_test_selected)) for i, (name, _) in enumerate(models)}

for train_idx, val_idx in kf.split(X_selected):
    X_tr, X_val = X_selected[train_idx], X_selected[val_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    for i, (model_name, model) in enumerate(models):
        model.fit(X_tr, y_tr)
        oof_preds[f"{model_name}_{i}"][val_idx] = model.predict(X_val)
        test_preds[f"{model_name}_{i}"] += model.predict(X_test_selected) / kf.n_splits

# Meta-model (Ridge Regression)
meta_features = np.column_stack([oof_preds[key] for key in oof_preds])
meta_model = Ridge(alpha=1.0)
meta_model.fit(meta_features, y)

# Evaluate on validation set
# Using the last fold's X_val for simplicity; for full OOF evaluation, adjust accordingly
meta_val_preds = meta_model.predict(np.column_stack([model.predict(X_val) for _, model in models]))
rmse = np.sqrt(mean_squared_error(y_val, meta_val_preds))
print(f"Validation RMSE with stacking of top 3 models: {rmse}")

# Final test predictions
meta_test_preds = meta_model.predict(np.column_stack([test_preds[key] for key in test_preds]))

# Prepare submission
submission = pd.DataFrame({
    'Item_Identifier': test_df['Item_Identifier'],
    'Outlet_Identifier': test_df['Outlet_Identifier'],
    'Item_Outlet_Sales': meta_test_preds
})
submission.to_csv('submission.csv', index=False)
print("Submission file created: submission.csv")

[I 2025-02-23 17:18:19,486] A new study created in memory with name: no-name-466a23e1-ba77-4fdb-b6d6-96e6a0f9bae7


Number of features selected: 7
Selected features: ['Item_MRP', 'Item_Fat_Content_Low Fat', 'Item_Type_Fruits and Vegetables', 'Item_Type_Seafood', 'Outlet_Type_Grocery Store', 'Outlet_Type_Supermarket Type2', 'Outlet_Type_Supermarket Type3']


[I 2025-02-23 17:18:24,118] Trial 0 finished with value: 1263.7668630241294 and parameters: {'model': 'XGBoost', 'xgb_n_estimators': 152, 'xgb_max_depth': 14, 'xgb_learning_rate': 0.18432335340553055, 'xgb_reg_alpha': 0.7080725777960455, 'xgb_reg_lambda': 0.020584494295802447}. Best is trial 0 with value: 1263.7668630241294.
[I 2025-02-23 17:18:28,876] Trial 1 finished with value: 1079.0800961535354 and parameters: {'model': 'RandomForest', 'rf_n_estimators': 572, 'rf_max_depth': 8, 'rf_min_samples_leaf': 36}. Best is trial 1 with value: 1079.0800961535354.
[I 2025-02-23 17:18:56,090] Trial 2 finished with value: 1204.0546481710796 and parameters: {'model': 'GradientBoosting', 'gb_n_estimators': 279, 'gb_max_depth': 9, 'gb_learning_rate': 0.18180022496999232, 'gb_min_samples_leaf': 14}. Best is trial 1 with value: 1079.0800961535354.
[I 2025-02-23 17:18:57,330] Trial 3 finished with value: 1092.673847557283 and parameters: {'model': 'ExtraTrees', 'et_n_estimators': 374, 'et_max_depth':

Top 3 models and their RMSE scores:
Model: ExtraTrees, RMSE: 1074.6657087616481, Params: {'et_n_estimators': 472, 'et_max_depth': 8, 'et_min_samples_leaf': 40}
Model: ExtraTrees, RMSE: 1074.6732943203172, Params: {'et_n_estimators': 815, 'et_max_depth': 8, 'et_min_samples_leaf': 40}
Model: ExtraTrees, RMSE: 1074.6850525248644, Params: {'et_n_estimators': 528, 'et_max_depth': 8, 'et_min_samples_leaf': 41}
Validation RMSE with stacking of top 3 models: 1073.4615374250563
Submission file created: submission.csv


In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import RFE
from sklearn.preprocessing import RobustScaler
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import optuna
from optuna.samplers import TPESampler
import warnings
warnings.filterwarnings('ignore')

# Load the datasets
train_df = pd.read_csv('train_v9rqX0R.csv')  # Replace with actual train file path
train_df = train_df[~train_df['Item_Identifier'].isin([
    'FDX20', 'FDG33', 'FDW13', 'FDG24', 'DRE49', 'NCY18',
    'FDO19', 'FDL34', 'FDO52', 'NCL31', 'FDA04', 'NCQ06',
    'FDT07', 'FDL10', 'FDX04', 'FDU19'])]
test_df = pd.read_csv('test_AbJTz2l.csv')    # Replace with actual test file path

# Combine train and test for preprocessing
train_df['source'] = 'train'
test_df['source'] = 'test'
combined_df = pd.concat([train_df, test_df], ignore_index=True)

# Data Preprocessing
combined_df['Item_Weight'] = combined_df.groupby('Item_Type')['Item_Weight'].transform(lambda x: x.fillna(x.mean()))
combined_df['Outlet_Size'] = combined_df.groupby('Outlet_Type')['Outlet_Size'].transform(
    lambda x: x.fillna(x.mode()[0] if not x.mode().empty else 'Medium'))
item_visibility_mean = combined_df[combined_df['Item_Visibility'] > 0].groupby('Item_Identifier')['Item_Visibility'].mean()
combined_df['Item_Visibility'] = combined_df.apply(
    lambda row: item_visibility_mean[row['Item_Identifier']] if row['Item_Visibility'] == 0 else row['Item_Visibility'],
    axis=1
)
combined_df['Item_Visibility'].fillna(combined_df['Item_Visibility'].mean(), inplace=True)

# Feature Engineering
combined_df['Outlet_Age'] = 2025 - combined_df['Outlet_Establishment_Year']
combined_df['Item_Fat_Content'] = combined_df['Item_Fat_Content'].replace({'LF': 'Low Fat', 'reg': 'Regular', 'low fat': 'Low Fat'})

# Encoding categorical variables
cat_cols = ['Item_Fat_Content', 'Item_Type', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']
for col in cat_cols:
    combined_df = pd.concat([combined_df, pd.get_dummies(combined_df[col], prefix=col)], axis=1)
    combined_df.drop(col, axis=1, inplace=True)

# Drop unnecessary columns
combined_df.drop(['Item_Identifier', 'Outlet_Identifier', 'Outlet_Establishment_Year'], axis=1, inplace=True)

# Split back into train and test
train = combined_df[combined_df['source'] == 'train'].drop('source', axis=1)
test = combined_df[combined_df['source'] == 'test'].drop(['source', 'Item_Outlet_Sales'], axis=1)

# Prepare features and target
X = train.drop('Item_Outlet_Sales', axis=1)
y = train['Item_Outlet_Sales']
X_test = test

# Scale numerical columns with RobustScaler
num_cols = ['Item_Weight', 'Item_Visibility', 'Item_MRP', 'Outlet_Age']
scaler = RobustScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

# Define the base model for RFE
base_model = ExtraTreesRegressor(n_estimators=472, max_depth=8, min_samples_leaf=40, n_jobs=-1, random_state=42)

# Set number of features to keep (half of total features)
n_features = X.shape[1] // 2

# Set up RFE
selector = RFE(estimator=base_model, n_features_to_select=n_features, step=1)

# Fit and transform your data
X_selected = selector.fit_transform(X, y)
X_test_selected = selector.transform(X_test)

# Get selected feature names
selected_features = X.columns[selector.get_support()].tolist()
print(f"Number of features selected: {len(selected_features)}")
print(f"Selected features: {selected_features}")

# Split train data for validation
X_train, X_val, y_train, y_val = train_test_split(X_selected, y, test_size=0.2, random_state=42)

# Define objective function for Optuna
def objective(trial):
    model_name = trial.suggest_categorical("model", [
        "RandomForest", "XGBoost", "LightGBM", "CatBoost", "ExtraTrees", "GradientBoosting"
    ])
    
    if model_name == "RandomForest":
        params = {
            'n_estimators': trial.suggest_int('rf_n_estimators', 100, 1000),
            'max_depth': trial.suggest_int('rf_max_depth', 3, 15),
            'min_samples_leaf': trial.suggest_int('rf_min_samples_leaf', 10, 100),
            'n_jobs': -1,
            'random_state': 42
        }
        model = RandomForestRegressor(**params)
    
    elif model_name == "XGBoost":
        params = {
            'n_estimators': trial.suggest_int('xgb_n_estimators', 100, 1000),
            'max_depth': trial.suggest_int('xgb_max_depth', 3, 15),
            'learning_rate': trial.suggest_float('xgb_learning_rate', 0.01, 0.3),
            'reg_alpha': trial.suggest_float('xgb_reg_alpha', 0, 1),
            'reg_lambda': trial.suggest_float('xgb_reg_lambda', 0, 1),
            'random_state': 42
        }
        model = XGBRegressor(**params)
    
    elif model_name == "LightGBM":
        params = {
            'n_estimators': trial.suggest_int('lgb_n_estimators', 100, 1000),
            'max_depth': trial.suggest_int('lgb_max_depth', 3, 15),
            'learning_rate': trial.suggest_float('lgb_learning_rate', 0.01, 0.3),
            'reg_alpha': trial.suggest_float('lgb_reg_alpha', 0, 1),
            'reg_lambda': trial.suggest_float('lgb_reg_lambda', 0, 1),
            'random_state': 42,
            'verbose': -1
        }
        model = LGBMRegressor(**params)
    
    elif model_name == "CatBoost":
        params = {
            'iterations': trial.suggest_int('cat_iterations', 100, 1000),
            'depth': trial.suggest_int('cat_depth', 3, 10),
            'learning_rate': trial.suggest_float('cat_learning_rate', 0.01, 0.3),
            'l2_leaf_reg': trial.suggest_float('cat_l2_leaf_reg', 1, 10),
            'random_seed': 42,
            'verbose': 0
        }
        model = CatBoostRegressor(**params)
    
    elif model_name == "ExtraTrees":
        params = {
            'n_estimators': trial.suggest_int('et_n_estimators', 100, 1000),
            'max_depth': trial.suggest_int('et_max_depth', 3, 15),
            'min_samples_leaf': trial.suggest_int('et_min_samples_leaf', 10, 100),
            'n_jobs': -1,
            'random_state': 42
        }
        model = ExtraTreesRegressor(**params)
    
    elif model_name == "GradientBoosting":
        params = {
            'n_estimators': trial.suggest_int('gb_n_estimators', 100, 1000),
            'max_depth': trial.suggest_int('gb_max_depth', 3, 15),
            'learning_rate': trial.suggest_float('gb_learning_rate', 0.01, 0.3),
            'min_samples_leaf': trial.suggest_int('gb_min_samples_leaf', 10, 100),
            'random_state': 42
        }
        model = GradientBoostingRegressor(**params)

    # Train and evaluate model with cross-validation
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    rmse_scores = []
    for train_idx, val_idx in kf.split(X):
        X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]
        model.fit(X_tr, y_tr)
        y_pred = model.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        rmse_scores.append(rmse)
    
    trial.set_user_attr('params', {k: v for k, v in trial.params.items() if k != 'model'})
    return np.mean(rmse_scores)

# Optimize with Optuna using TPESampler
sampler = TPESampler(seed=42)
study = optuna.create_study(direction='minimize', sampler=sampler)
study.optimize(objective, n_trials=50)

# Get top trials and identify best model per algorithm
trials = sorted(study.trials, key=lambda t: t.value if t.value is not None else float('inf'))
algorithm_best_trials = {}
for trial in trials:
    model_name = trial.params['model']
    if model_name not in algorithm_best_trials:  # Keep only the first (best) trial for each algorithm
        algorithm_best_trials[model_name] = trial
    if len(algorithm_best_trials) == 3:  # Stop after getting top 3 algorithms
        break

# Initialize models with best parameters from top 3 algorithms
models = []
for model_name, trial in algorithm_best_trials.items():
    # Remove model-specific prefix from parameter names
    params = {k.split('_', 1)[1] if '_' in k else k: v for k, v in trial.params.items() if k != 'model'}
    
    if model_name == "RandomForest":
        model = RandomForestRegressor(**params, n_jobs=-1, random_state=42)
    elif model_name == "XGBoost":
        model = XGBRegressor(**params, random_state=42)
    elif model_name == "LightGBM":
        model = LGBMRegressor(**params, random_state=42, verbose=-1)
    elif model_name == "CatBoost":
        if 'iterations' in params:
            params['iterations'] = params.pop('iterations')
        model = CatBoostRegressor(**params, random_seed=42, verbose=0)
    elif model_name == "ExtraTrees":
        model = ExtraTreesRegressor(**params, n_jobs=-1, random_state=42)
    elif model_name == "GradientBoosting":
        model = GradientBoostingRegressor(**params, random_state=42)
    
    models.append((model_name, model))

print("Top models from 3 distinct algorithms:")
for model_name, trial in algorithm_best_trials.items():
    print(f"Model: {model_name}, RMSE: {trial.value}, Params: {trial.user_attrs['params']}")

# Stacking with top models from top 3 algorithms
kf = KFold(n_splits=5, shuffle=True, random_state=42)
oof_preds = {f"{name}_{i}": np.zeros(len(X_selected)) for i, (name, _) in enumerate(models)}
test_preds = {f"{name}_{i}": np.zeros(len(X_test_selected)) for i, (name, _) in enumerate(models)}

for train_idx, val_idx in kf.split(X_selected):
    X_tr, X_val = X_selected[train_idx], X_selected[val_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    for i, (model_name, model) in enumerate(models):
        model.fit(X_tr, y_tr)
        oof_preds[f"{model_name}_{i}"][val_idx] = model.predict(X_val)
        test_preds[f"{model_name}_{i}"] += model.predict(X_test_selected) / kf.n_splits

# Meta-model (Ridge Regression)
meta_features = np.column_stack([oof_preds[key] for key in oof_preds])
meta_model = Ridge(alpha=1.0)
meta_model.fit(meta_features, y)

# Evaluate on validation set
meta_val_preds = meta_model.predict(np.column_stack([model.predict(X_val) for _, model in models]))
rmse = np.sqrt(mean_squared_error(y_val, meta_val_preds))
print(f"Validation RMSE with stacking of top models from top 3 algorithms: {rmse}")

# Final test predictions
meta_test_preds = meta_model.predict(np.column_stack([test_preds[key] for key in test_preds]))

# Prepare submission
submission = pd.DataFrame({
    'Item_Identifier': test_df['Item_Identifier'],
    'Outlet_Identifier': test_df['Outlet_Identifier'],
    'Item_Outlet_Sales': meta_test_preds
})
submission.to_csv('submission.csv', index=False)
print("Submission file created: submission.csv")

[I 2025-02-23 16:45:21,149] A new study created in memory with name: no-name-371f0e81-9f87-424a-acd4-cdff3446150c


Number of features selected: 16
Selected features: ['Item_Weight', 'Item_Visibility', 'Item_MRP', 'Outlet_Age', 'Item_Fat_Content_Low Fat', 'Item_Fat_Content_Regular', 'Item_Type_Fruits and Vegetables', 'Item_Type_Household', 'Outlet_Size_Small', 'Outlet_Location_Type_Tier 1', 'Outlet_Location_Type_Tier 2', 'Outlet_Location_Type_Tier 3', 'Outlet_Type_Grocery Store', 'Outlet_Type_Supermarket Type1', 'Outlet_Type_Supermarket Type2', 'Outlet_Type_Supermarket Type3']


[I 2025-02-23 16:45:25,385] Trial 0 finished with value: 1263.7668630241294 and parameters: {'model': 'XGBoost', 'xgb_n_estimators': 152, 'xgb_max_depth': 14, 'xgb_learning_rate': 0.18432335340553055, 'xgb_reg_alpha': 0.7080725777960455, 'xgb_reg_lambda': 0.020584494295802447}. Best is trial 0 with value: 1263.7668630241294.
[I 2025-02-23 16:45:30,213] Trial 1 finished with value: 1079.0800961535354 and parameters: {'model': 'RandomForest', 'rf_n_estimators': 572, 'rf_max_depth': 8, 'rf_min_samples_leaf': 36}. Best is trial 1 with value: 1079.0800961535354.
[I 2025-02-23 16:45:57,354] Trial 2 finished with value: 1204.0546481710796 and parameters: {'model': 'GradientBoosting', 'gb_n_estimators': 279, 'gb_max_depth': 9, 'gb_learning_rate': 0.18180022496999232, 'gb_min_samples_leaf': 14}. Best is trial 1 with value: 1079.0800961535354.
[I 2025-02-23 16:45:58,580] Trial 3 finished with value: 1092.673847557283 and parameters: {'model': 'ExtraTrees', 'et_n_estimators': 374, 'et_max_depth':

Top models from 3 distinct algorithms:
Model: ExtraTrees, RMSE: 1074.6657087616484, Params: {'et_n_estimators': 472, 'et_max_depth': 8, 'et_min_samples_leaf': 40}
Model: CatBoost, RMSE: 1078.8584342536942, Params: {'cat_iterations': 120, 'cat_depth': 10, 'cat_learning_rate': 0.03355497508874883, 'cat_l2_leaf_reg': 9.742304653893186}
Model: RandomForest, RMSE: 1079.0800961535354, Params: {'rf_n_estimators': 572, 'rf_max_depth': 8, 'rf_min_samples_leaf': 36}
Validation RMSE with stacking of top models from top 3 algorithms: 1069.1012497284676
Submission file created: submission.csv


In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import RobustScaler
import warnings
warnings.filterwarnings('ignore')

# Load the datasets
train_df = pd.read_csv('train_v9rqX0R.csv')  # Replace with actual train file path
train_df = train_df[~train_df['Item_Identifier'].isin([
    'FDX20', 'FDG33', 'FDW13', 'FDG24', 'DRE49', 'NCY18',
    'FDO19', 'FDL34', 'FDO52', 'NCL31', 'FDA04', 'NCQ06',
    'FDT07', 'FDL10', 'FDX04', 'FDU19'])]
test_df = pd.read_csv('test_AbJTz2l.csv')    # Replace with actual test file path

# Combine train and test for preprocessing
train_df['source'] = 'train'
test_df['source'] = 'test'
combined_df = pd.concat([train_df, test_df], ignore_index=True)

# Data Preprocessing
combined_df['Item_Weight'] = combined_df.groupby('Item_Type')['Item_Weight'].transform(lambda x: x.fillna(x.mean()))
combined_df['Outlet_Size'] = combined_df.groupby('Outlet_Type')['Outlet_Size'].transform(
    lambda x: x.fillna(x.mode()[0] if not x.mode().empty else 'Medium'))
item_visibility_mean = combined_df[combined_df['Item_Visibility'] > 0].groupby('Item_Identifier')['Item_Visibility'].mean()
combined_df['Item_Visibility'] = combined_df.apply(
    lambda row: item_visibility_mean[row['Item_Identifier']] if row['Item_Visibility'] == 0 else row['Item_Visibility'],
    axis=1
)
combined_df['Item_Visibility'].fillna(combined_df['Item_Visibility'].mean(), inplace=True)

# Feature Engineering
combined_df['Outlet_Age'] = 2025 - combined_df['Outlet_Establishment_Year']
combined_df['Item_Fat_Content'] = combined_df['Item_Fat_Content'].replace({'LF': 'Low Fat', 'reg': 'Regular', 'low fat': 'Low Fat'})

# Encoding categorical variables (only for selected features that need encoding)
combined_df = pd.concat([combined_df, pd.get_dummies(combined_df['Item_Fat_Content'], prefix='Item_Fat_Content')], axis=1)
combined_df = pd.concat([combined_df, pd.get_dummies(combined_df['Item_Type'], prefix='Item_Type')], axis=1)
combined_df = pd.concat([combined_df, pd.get_dummies(combined_df['Outlet_Type'], prefix='Outlet_Type')], axis=1)

# Drop unnecessary columns
combined_df.drop(['Item_Identifier', 'Outlet_Identifier', 'Outlet_Establishment_Year'], axis=1, inplace=True)

# Split back into train and test
train = combined_df[combined_df['source'] == 'train'].drop('source', axis=1)
test = combined_df[combined_df['source'] == 'test'].drop(['source', 'Item_Outlet_Sales'], axis=1)

# Selected features
selected_features = [
    'Item_MRP', 
    'Item_Fat_Content_Low Fat', 
    'Item_Type_Fruits and Vegetables', 
    'Item_Type_Seafood', 
    'Outlet_Type_Grocery Store', 
    'Outlet_Type_Supermarket Type2', 
    'Outlet_Type_Supermarket Type3'
]

# Prepare features and target
X = train[selected_features]
y = train['Item_Outlet_Sales']
X_test = test[selected_features]

# Scale numerical columns with RobustScaler (only Item_MRP is numerical among selected features)
scaler = RobustScaler()
X[['Item_MRP']] = scaler.fit_transform(X[['Item_MRP']])
X_test[['Item_MRP']] = scaler.transform(X_test[['Item_MRP']])

# Initialize ExtraTreesRegressor
model = ExtraTreesRegressor(
    n_estimators=500,
    max_depth=10,
    min_samples_leaf=20,
    n_jobs=-1,
    random_state=42
)

# 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
oof_preds = np.zeros(len(X))
test_preds = np.zeros(len(X_test))
rmse_scores = []

for train_idx, val_idx in kf.split(X):
    X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    # Train the model
    model.fit(X_tr, y_tr)
    
    # Predict on validation set
    val_preds = model.predict(X_val)
    oof_preds[val_idx] = val_preds
    
    # Calculate RMSE for this fold
    rmse = np.sqrt(mean_squared_error(y_val, val_preds))
    rmse_scores.append(rmse)
    
    # Predict on test set (average across folds)
    test_preds += model.predict(X_test) / kf.n_splits

# Print results
print(f"5-Fold Cross-Validation RMSE Scores: {rmse_scores}")
print(f"Average RMSE: {np.mean(rmse_scores)}")
print(f"Standard Deviation of RMSE: {np.std(rmse_scores)}")

# Prepare submission
submission = pd.DataFrame({
    'Item_Identifier': test_df['Item_Identifier'],
    'Outlet_Identifier': test_df['Outlet_Identifier'],
    'Item_Outlet_Sales': test_preds
})
submission.to_csv('submission_extra_trees.csv', index=False)
print("Submission file created: submission_extra_trees.csv")

5-Fold Cross-Validation RMSE Scores: [1117.0704148038217, 1075.8356166754372, 1049.429781718685, 1069.085988629873, 1068.4921425557623]
Average RMSE: 1075.9827888767159
Standard Deviation of RMSE: 22.347046735832947
Submission file created: submission_extra_trees.csv
