# Libraries

In [90]:
# Importing necessary libraries
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [88]:
!pip install catboost
!pip install xgboost
!pip install lightgbm
!pip install openpyxl



In [89]:
# Machine Learning Models
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from xgboost import XGBRegressor
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from sklearn.ensemble import VotingRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import AdaBoostRegressor

# Data Loading

In [91]:
import numpy as np
import pandas as pd

# Loading the CSV files instead of Excel files
df = pd.read_csv('clustered_data_new.csv')

# Display the first few rows to check the structure before merging
df.head()

Unnamed: 0,Store_Number,Perc_Eastern_Europeans,Gender Ratio,Temperature,Item_Code,Item_Name,Retail_Price,Count_Week_Instock,Normalized_Sales_$L52W,Sales Bucket,...,Package_Type_1L,Package_Type_200-3gft,Package_Type_200ml,Package_Type_375ml,Package_Type_700ml,Package_Type_720ml,Package_Type_750gft,Package_Type_750ml,Count_Week_Instock_Normalized,Cluster_Label
0,2802,0.17,98.24,57.5,503010,Barton Vodka,6.99,52,4661.0,Sales included,...,1,0,0,0,0,0,0,0,1.0,3
1,2705,0.17,97.16,55.2,503010,Barton Vodka,6.49,52,100.0,Sales included,...,1,0,0,0,0,0,0,0,1.0,3
2,2801,0.17,98.24,57.5,503010,Barton Vodka,7.49,39,,Sales excluded for model test,...,1,0,0,0,0,0,0,0,0.75,3
3,802,0.19,94.33,65.4,503175,Barton Vodka,12.99,52,4689.0,Sales included,...,0,0,0,0,0,0,0,0,1.0,3
4,2201,0.76,94.38,49.1,503175,Barton Vodka,13.99,52,3926.0,Sales included,...,0,0,0,0,0,0,0,0,1.0,3


In [92]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60973 entries, 0 to 60972
Data columns (total 100 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Store_Number                        60973 non-null  int64  
 1   Perc_Eastern_Europeans              60973 non-null  float64
 2   Gender Ratio                        60973 non-null  float64
 3   Temperature                         60973 non-null  float64
 4   Item_Code                           60973 non-null  int64  
 5   Item_Name                           60973 non-null  object 
 6   Retail_Price                        60973 non-null  float64
 7   Count_Week_Instock                  60973 non-null  int64  
 8   Normalized_Sales_$L52W              54894 non-null  float64
 9   Sales Bucket                        60973 non-null  object 
 10  Store_Name                          60973 non-null  object 
 11  Open_Date                           6097

In [93]:
df.columns

Index(['Store_Number', 'Perc_Eastern_Europeans', 'Gender Ratio', 'Temperature',
       'Item_Code', 'Item_Name', 'Retail_Price', 'Count_Week_Instock',
       'Normalized_Sales_$L52W', 'Sales Bucket', 'Store_Name', 'Open_Date',
       'Market_Name', 'Store_Address', 'Households',
       'Perc_HH_Income_above100K', 'Median_HH_Income', 'Average_Net_Worth',
       'Perc_Population_Bachelor_Degree', 'Perc_Hispanic', 'Perc_Asian',
       'Perc_African_American', 'Perc_Population_Age_50-70', 'US Whiskey',
       'Tequila Under $65', 'Tequila Over $65', 'Scotch Under $75',
       'Scotch Over $75', 'Vodka', 'Cordials', 'Brandy Under $85',
       'Brandy Over $85', 'Cabernet Under $20', 'Cabernet $20-50',
       'Cabernet Over $50', 'Chardonnay Under $20', 'Chardonnay Over $20',
       'Wine - Sparkling', 'Pinot Noir Under $20', 'Pinot Noir Over $20',
       'Sauvignon Blanc', 'French Champagne', 'Market_Sales_L52wk',
       'Count_Item_Location', 'Store_Age_Days', 'High_Education_High_Income',

In [94]:
print(df.isna().sum())

Store_Number                     0
Perc_Eastern_Europeans           0
Gender Ratio                     0
Temperature                      0
Item_Code                        0
                                ..
Package_Type_720ml               0
Package_Type_750gft              0
Package_Type_750ml               0
Count_Week_Instock_Normalized    0
Cluster_Label                    0
Length: 100, dtype: int64


# Data Prep

In [95]:
sales_included = df[df['Sales Bucket'] == 'Sales included']
sales_excluded = df[df['Sales Bucket'] == 'Sales excluded for model test']

In [96]:
print(sales_included.shape[0])
print(sales_excluded.shape[0])
print(df['Sales Bucket'].value_counts())

54894
6079
Sales Bucket
Sales included                   54894
Sales excluded for model test     6079
Name: count, dtype: int64


In [97]:
# Filling NAs
def fill_NAs(df):
  df_cleaned = df.copy()

  # Group by 'Item Code' and 'Price Zone' and calculate the average 'L52W in Stock'
  average_l52w = df_cleaned.groupby(['Item_Code', 'Market_Name'])['Count_Week_Instock'].transform('mean')
  # Replace NaN values in 'L52W in Stock' with the calculated average
  df_cleaned['Count_Week_Instock'] = df_cleaned['Count_Week_Instock'].fillna(average_l52w)

  df_cleaned['Market_Sales_L52wk'] = df_cleaned.groupby(['Item_Code'])['Market_Sales_L52wk'].transform(
    lambda x: x.fillna(x.mean()))

  df_cleaned['Market_Sales_L52wk'] = df_cleaned['Market_Sales_L52wk'].fillna(
    df_cleaned.groupby(['Item_Code'])['Normalized_Sales_$L52W'].transform('mean'))

  df_cleaned['Market_Sales_L52wk'] = df_cleaned['Market_Sales_L52wk'].fillna(
    df_cleaned.groupby(['Item_Code'])['Normalized_Sales_$L52W'].transform('mean'))

  return df_cleaned


# Assuming your DataFrame is named 'df'
# Replace 'df' with the actual name of your DataFrame.

df_cleaned = fill_NAs(df)

In [98]:
columns_to_log = [ 'Retail_Price', 'Households', 'Average_Net_Worth', 'Store_Age_Days',
                  'Vodka_Sales_Factor', 'Household_Income_to_NetWorth_Ratio',
                   'Wealth_Diversity_Index', 'Normalized_Sales_$L52W', 'Market_Sales_L52wk']

for column in columns_to_log:
  # Add a small constant to avoid taking the log of zero
  df_cleaned['log_' + column] = np.log(df_cleaned[column] + 1e-6)

In [99]:
# Define the bins for Average Net Worth
bins = [df_cleaned["Average_Net_Worth"].min(), df_cleaned["Average_Net_Worth"].quantile(0.25), df_cleaned["Average_Net_Worth"].quantile(0.5), df_cleaned["Average_Net_Worth"].quantile(0.75), df_cleaned["Average_Net_Worth"].max()]
labels = ['Low', 'Medium', 'High', 'Very High']

# Assuming 'Average Net Worth' is a column in your DataFrame 'df_cleaned'
# You can replace 'Average Net Worth' with the actual column name in your DataFrame.

df_cleaned['Net_Worth_Category'] = pd.cut(df_cleaned['Average_Net_Worth'], bins=bins, labels=labels, include_lowest=True)

# Print the value counts of the new category column
print(df_cleaned['Net_Worth_Category'].value_counts())

Net_Worth_Category
Low          15445
Medium       15302
Very High    15229
High         14997
Name: count, dtype: int64


In [100]:
# Create the 'Total Wine Partners' column and initialize it to 0
total_wine_partners = [
    "Tower Vodka", "Summum Vodka", "Hope Vodka", "Purity Vodka Connoisseur 51", "Classic Club Vodka",
    "The American Plains Vodka", "Froggy B Vodka", "Roberto Cavalli Vodka", "Akva Organic Swedish Vodka",
    "Gallant Vodka", "Opulent Vodka", "Veil Vodka", "Starr Blu Vodka", "Lyna Vodka", "Karkov Vodka",
    "Prairie Organic Vodka", "Pau Maui Handcrafted Vodka", "Eight Degrees Vodka", "Esme Black Shield Vodka",
    "Greenhouse Organic Vodka", "ABK6 Organic Vodka", "Ivanhalder's 1815 Vodka", "Stateside Urbancraft Vodka",
    "3 Howls Blood Orange Vodka"
]

# Create the 'Spirits_Direct' column and initialize it to 0
df_cleaned['Spirits_Direct'] = 0

# Check if the 'Item_Name' column contains any partnered brand name
df_cleaned['Spirits_Direct'] = df_cleaned['Item_Name'].apply(
    lambda x: 1 if any(partner in str(x) for partner in total_wine_partners) else 0
)

In [101]:
# Define a list of common flavor keywords associated with flavored vodkas
flavor_keywords = [
    'Apple', 'Apricot', 'Berry', 'Blackberry', 'Blueberry', 'Cherry', 'Citrus', 'Coconut', 'Cranberry',
    'Grape', 'Grapefruit', 'Lemon', 'Lime', 'Mango', 'Melon', 'Orange', 'Peach', 'Pear', 'Pineapple',
    'Raspberry', 'Strawberry', 'Vanilla', 'Watermelon', 'Pepper', 'Chocolate', 'Espresso', 'Coffee',
    'Caramel', 'Honey', 'Cinnamon', 'Peppermint', 'Whipped', 'Cake', 'Marshmallow', 'Butterscotch',
    'vanilla', 'citrus', 'berry', 'peach', 'apple', 'lemon', 'lime', 'orange', 'raspberry',
    'strawberry', 'cherry', 'pineapple', 'mango', 'coconut', 'pepper', 'chocolate', 'caramel',
    'coffee', 'espresso', 'honey', 'ginger', 'melon', 'grape', 'pomegranate', 'watermelon',
    'cucumber', 'peppermint', 'cinnamon', 'spice', 'tea', 'mint', 'cake', 'whipped', 'cream',
    'butterscotch', 'toffee', 'hazelnut', 'almond', 'fig', 'apricot', 'pear', 'passion fruit',
    'kiwi', 'blueberry', 'blackberry', 'cranberry', 'grapefruit', 'tangerine', 'blood orange',
    'hibiscus', 'lavender', 'rose', 'elderflower', 'lychee', 'papaya', 'guava', 'dragonfruit',
    'acai', 'jalapeno', 'chipotle', 'sriracha', 'wasabi', 'bacon', 'smoked', 'maple', 'pumpkin',
    'smores', 'birthday cake', 'cotton candy', 'bubblegum', 'root beer', 'cola', 'gingerbread',
    'candy cane', 'sugar cookie', 'salted caramel', 'pecan', 'walnut', 'macadamia', 'pistachio',
    'chai', 'matcha', 'mocha', 'hazelnut', 'almond', 'butter', 'biscuit', 'cookie', 'brownie',
    'fudge', 'truffle', 'nougat', 'praline', 'marzipan', 'amaretto', 'tiramisu', 'baklava',
    'bakery', 'dessert', 'pastry', 'confection', 'sweet', 'sour', 'spicy', 'herb', 'botanical',
    'infusion', 'essence', 'extract', 'liqueur', 'cordial', 'schnapps'
]

# Create the 'Flavored_Vodka' column and initialize it to 0
df_cleaned['Flavored_Vodka'] = 0

# Function to check if any flavor keyword is in the item name
def is_flavored_vodka(item_name):
    # Convert item name to string and check for each keyword
    return any(keyword.lower() in str(item_name).lower() for keyword in flavor_keywords)

# Apply the function to the 'Item_Name' column
df_cleaned['Flavored_Vodka'] = df_cleaned['Item_Name'].apply(is_flavored_vodka).astype(int)

In [102]:
# Brand recognition
brand_keywords = [
    'Smirnoff', 'Absolut', 'Zubrowka', 'Magic Moments', 'Arkhangelskaya', 'Zoladkowa', 'Grey Goose', 'Soplica', 'Pyat Ozer',
    'Nemiroff', 'Belenkaya', 'Skyy', 'Talka', 'Ketel One', 'Finlandia', 'Russian Standard', 'Wodka Gorbatschow', 'Tsarskaya',
    'Imperial Collection Gold', 'Green Mark', 'Belaya Bereza'
]

# Create the 'Top20_Vodka' column and initialize it to 0
df_cleaned['Top20_Vodka'] = 0

# Function to check if any brand keyword is in the item name
def is_top20_vodka(item_name):
    # Convert item name to string and check for each keyword
    return any(keyword.lower() in str(item_name).lower() for keyword in brand_keywords)

# Apply the function to the 'Item_Name' column
df_cleaned['Top20_Vodka'] = df_cleaned['Item_Name'].apply(is_top20_vodka).astype(int)

In [103]:
# List of columns to process
columns_to_engineer = [
    'US Whiskey', 'Tequila Under $65', 'Tequila Over $65',
    'Scotch Under $75', 'Scotch Over $75', 'Vodka', 'Cordials',
    'Brandy Under $85', 'Brandy Over $85', 'Cabernet Under $20', 'Cabernet $20-50', 'Cabernet Over $50', 'Chardonnay Under $20', 'Chardonnay Over $20', 'Wine - Sparkling', 'Pinot Noir Under $20', 'Pinot Noir Over $20', 'Sauvignon Blanc', 'French Champagne'
]

for col in columns_to_engineer:
    df_cleaned[f'{col}_engineered'] = df_cleaned[col] + 1

df_cleaned['Vodka_Tequila_Under_65_Ratio'] = df_cleaned['Vodka_engineered']/df_cleaned['Tequila Under $65_engineered']
df_cleaned['Vodka_Tequila_Over_65_Ratio'] = df_cleaned['Vodka_engineered']/df_cleaned['Tequila Over $65_engineered']

# Calculate the average wine value
wine_columns = [
    'Cabernet Under $20_engineered', 'Cabernet $20-50_engineered', 'Cabernet Over $50_engineered',
    'Chardonnay Under $20_engineered', 'Chardonnay Over $20_engineered', 'Wine - Sparkling_engineered',
    'Pinot Noir Under $20_engineered', 'Pinot Noir Over $20_engineered', 'Sauvignon Blanc_engineered',
    'French Champagne_engineered'
]

df_cleaned['Average_Wine_Value'] = df_cleaned[wine_columns].mean(axis=1)

# Create ratio variable
df_cleaned['Vodka_Wine_Ratio'] = df_cleaned['Vodka_engineered']/df_cleaned['Average_Wine_Value']

In [104]:
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60973 entries, 0 to 60972
Columns: 136 entries, Store_Number to Vodka_Wine_Ratio
dtypes: category(1), float64(37), int64(92), object(6)
memory usage: 62.9+ MB


In [105]:
df_cleaned.columns

Index(['Store_Number', 'Perc_Eastern_Europeans', 'Gender Ratio', 'Temperature',
       'Item_Code', 'Item_Name', 'Retail_Price', 'Count_Week_Instock',
       'Normalized_Sales_$L52W', 'Sales Bucket',
       ...
       'Chardonnay Over $20_engineered', 'Wine - Sparkling_engineered',
       'Pinot Noir Under $20_engineered', 'Pinot Noir Over $20_engineered',
       'Sauvignon Blanc_engineered', 'French Champagne_engineered',
       'Vodka_Tequila_Under_65_Ratio', 'Vodka_Tequila_Over_65_Ratio',
       'Average_Wine_Value', 'Vodka_Wine_Ratio'],
      dtype='object', length=136)

# XGBoost

In [109]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Define the features and target

# Split the data into 'data' and 'pred' based on 'Sales Bucket'
data = df_cleaned[df_cleaned['Sales Bucket'] == 'Sales included']
pred = df_cleaned[df_cleaned['Sales Bucket'] == 'Sales excluded for model test']

# Define the features and target for 'data'
X = data[['Vodka', 'Store_Size_Extra Large',
          'Store_Size_Large', 'Store_Size_Medium', 'Store_Size_Small', 'Store_State_AZ', 'Store_State_CA',
          'Store_State_CO', 'Store_State_CT', 'Store_State_DE', 'Store_State_FL', 'Store_State_GA',
          'Store_State_IL', 'Store_State_IN', 'Store_State_KS', 'Store_State_KY', 'Store_State_LA',
          'Store_State_MA', 'Store_State_MD', 'Store_State_MI', 'Store_State_MN', 'Store_State_MO',
          'Store_State_NJ', 'Store_State_NM', 'Store_State_NV', 'Store_State_NY', 'Store_State_SC',
          'Store_State_TN', 'Store_State_TX', 'Store_State_WA', 'Store_State_WI', 'Package_Type_1.5L',
          'Package_Type_1.75L', 'Package_Type_1.75Lgft', 'Package_Type_100ml', 'Package_Type_1L',
          'Package_Type_200-3gft', 'Package_Type_200ml', 'Package_Type_375ml', 'Package_Type_700ml',
          'Package_Type_720ml', 'Package_Type_750gft', 'Package_Type_750ml', 'Count_Week_Instock_Normalized',
          'log_Retail_Price', 'log_Households', 'log_Store_Age_Days', 'Cluster_Label',
          'Vodka_Tequila_Under_65_Ratio', 'Vodka_Tequila_Over_65_Ratio', 'Vodka_Wine_Ratio',
          'Spirits_Direct', 'Flavored_Vodka', 'Top20_Vodka']]
y = data['Normalized_Sales_$L52W']

# Split 'data' into training + validation (90%) and holdout (10%)
X_train_val, X_holdout, y_train_val, y_holdout = train_test_split(X, y, test_size=0.1, random_state=42)

# Split the training + validation set into training (80%) and validation (20%)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)

# Preprocessing pipeline
numerical_features = ['Vodka', 'Count_Week_Instock_Normalized', 'log_Retail_Price',
                      'log_Households', 'log_Store_Age_Days', 'Vodka_Tequila_Under_65_Ratio',
                      'Vodka_Tequila_Over_65_Ratio', 'Vodka_Wine_Ratio']
categorical_features = ['Store_Size_Extra Large', 'Store_Size_Large', 'Store_Size_Medium',
                        'Store_Size_Small', 'Store_State_AZ', 'Store_State_CA', 'Store_State_CO',
                        'Store_State_CT', 'Store_State_DE', 'Store_State_FL', 'Store_State_GA',
                        'Store_State_IL', 'Store_State_IN', 'Store_State_KS', 'Store_State_KY',
                        'Store_State_LA', 'Store_State_MA', 'Store_State_MD', 'Store_State_MI',
                        'Store_State_MN', 'Store_State_MO', 'Store_State_NJ', 'Store_State_NM',
                        'Store_State_NV', 'Store_State_NY', 'Store_State_SC', 'Store_State_TN',
                        'Store_State_TX', 'Store_State_WA', 'Store_State_WI', 'Package_Type_1.5L',
                        'Package_Type_1.75L', 'Package_Type_1.75Lgft', 'Package_Type_100ml',
                        'Package_Type_1L', 'Package_Type_200-3gft', 'Package_Type_200ml',
                        'Package_Type_375ml', 'Package_Type_700ml', 'Package_Type_720ml',
                        'Package_Type_750gft', 'Package_Type_750ml', 'Cluster_Label',
                        'Spirits_Direct', 'Flavored_Vodka', 'Top20_Vodka']

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(drop='first'), categorical_features)
    ]
)

# Import the required XGBoost library
from xgboost import XGBRegressor

# Initialize the XGBoost regressor
xgboost_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('xgboost', XGBRegressor(
    objective='reg:squarederror',
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=2424
))])

# Train the model
xgboost_pipeline.fit(X_train, y_train)

In [110]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Predict on training, validation, and holdout sets
y_train_pred = xgboost_pipeline.predict(X_train)
y_val_pred = xgboost_pipeline.predict(X_val)
y_holdout_pred = xgboost_pipeline.predict(X_holdout)

# Calculate RMSE and MAE for training, validation, and holdout sets
train_rmse = mean_squared_error(y_train, y_train_pred, squared=False)
val_rmse = mean_squared_error(y_val, y_val_pred, squared=False)
holdout_rmse = mean_squared_error(y_holdout, y_holdout_pred, squared=False)

train_mae = mean_absolute_error(y_train, y_train_pred)
val_mae = mean_absolute_error(y_val, y_val_pred)
holdout_mae = mean_absolute_error(y_holdout, y_holdout_pred)

# Calculate R² for training, validation, and holdout sets
train_r2 = xgboost_pipeline.score(X_train, y_train)
val_r2 = xgboost_pipeline.score(X_val, y_val)
holdout_r2 = xgboost_pipeline.score(X_holdout, y_holdout)

# Print metrics
print("XGBoost Model Performance:")
print(f"Training RMSE: {train_rmse:.4f}, MAE: {train_mae:.4f}, R²: {train_r2:.4f}")
print(f"Validation RMSE: {val_rmse:.4f}, MAE: {val_mae:.4f}, R²: {val_r2:.4f}")
print(f"Holdout RMSE: {holdout_rmse:.4f}, MAE: {holdout_mae:.4f}, R²: {holdout_r2:.4f}")



XGBoost Model Performance:
Training RMSE: 5722.4524, MAE: 2138.8038, R²: 0.8814
Validation RMSE: 14647.0310, MAE: 2843.3127, R²: 0.2848
Holdout RMSE: 10962.0688, MAE: 2692.0637, R²: 0.6604


# Random Forest

In [111]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Initialize the Random Forest regressor
random_forest_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('random_forest', RandomForestRegressor(
        n_estimators=500,
        max_depth=10,  # Set max_depth to prevent overfitting; adjust as needed
        min_samples_split=5,  # Controls tree splitting; adjust for performance
        random_state=2424
    ))
])

# Train the Random Forest model
random_forest_pipeline.fit(X_train, y_train)

# Predict on training, validation, and holdout sets
y_train_pred_rf = random_forest_pipeline.predict(X_train)
y_val_pred_rf = random_forest_pipeline.predict(X_val)
y_holdout_pred_rf = random_forest_pipeline.predict(X_holdout)

# Calculate RMSE and MAE for training, validation, and holdout sets
train_rmse_rf = mean_squared_error(y_train, y_train_pred_rf, squared=False)
val_rmse_rf = mean_squared_error(y_val, y_val_pred_rf, squared=False)
holdout_rmse_rf = mean_squared_error(y_holdout, y_holdout_pred_rf, squared=False)

train_mae_rf = mean_absolute_error(y_train, y_train_pred_rf)
val_mae_rf = mean_absolute_error(y_val, y_val_pred_rf)
holdout_mae_rf = mean_absolute_error(y_holdout, y_holdout_pred_rf)

# Calculate R² for training, validation, and holdout sets
train_r2_rf = random_forest_pipeline.score(X_train, y_train)
val_r2_rf = random_forest_pipeline.score(X_val, y_val)
holdout_r2_rf = random_forest_pipeline.score(X_holdout, y_holdout)

# Print metrics
print("Random Forest Model Performance:")
print(f"Training RMSE: {train_rmse_rf:.4f}, MAE: {train_mae_rf:.4f}, R²: {train_r2_rf:.4f}")
print(f"Validation RMSE: {val_rmse_rf:.4f}, MAE: {val_mae_rf:.4f}, R²: {val_r2_rf:.4f}")
print(f"Holdout RMSE: {holdout_rmse_rf:.4f}, MAE: {holdout_mae_rf:.4f}, R²: {holdout_r2_rf:.4f}")



Random Forest Model Performance:
Training RMSE: 6966.4584, MAE: 2156.2656, R²: 0.8242
Validation RMSE: 15635.8856, MAE: 2644.5398, R²: 0.1850
Holdout RMSE: 10916.0043, MAE: 2493.5809, R²: 0.6633


#Ensemble-XGB+RF

In [None]:
import numpy as np
from sklearn.ensemble import VotingRegressor

# Assuming xgboost_pipeline and random_forest_pipeline are already defined and trained

# Create the ensemble model using VotingRegressor
ensemble_model = VotingRegressor(
    estimators=[('xgboost', xgboost_pipeline), ('random_forest', random_forest_pipeline)],
    weights=[1,1] #weights can be adjusted based on model performance
)

# Train the ensemble model
ensemble_model.fit(X_train, y_train)

# Predict on training, validation, and holdout sets
y_train_pred_ensemble = ensemble_model.predict(X_train)
y_val_pred_ensemble = ensemble_model.predict(X_val)
y_holdout_pred_ensemble = ensemble_model.predict(X_holdout)

# Evaluate the ensemble model
from sklearn.metrics import mean_squared_error, mean_absolute_error

train_rmse_ensemble = mean_squared_error(y_train, y_train_pred_ensemble, squared=False)
val_rmse_ensemble = mean_squared_error(y_val, y_val_pred_ensemble, squared=False)
holdout_rmse_ensemble = mean_squared_error(y_holdout, y_holdout_pred_ensemble, squared=False)

train_mae_ensemble = mean_absolute_error(y_train, y_train_pred_ensemble)
val_mae_ensemble = mean_absolute_error(y_val, y_val_pred_ensemble)
holdout_mae_ensemble = mean_absolute_error(y_holdout, y_holdout_pred_ensemble)

train_r2_ensemble = ensemble_model.score(X_train, y_train)
val_r2_ensemble = ensemble_model.score(X_val, y_val)
holdout_r2_ensemble = ensemble_model.score(X_holdout, y_holdout)

print("Ensemble Model Performance:")
print(f"Training RMSE: {train_rmse_ensemble:.4f}, MAE: {train_mae_ensemble:.4f}, R²: {train_r2_ensemble:.4f}")
print(f"Validation RMSE: {val_rmse_ensemble:.4f}, MAE: {val_mae_ensemble:.4f}, R²: {val_r2_ensemble:.4f}")
print(f"Holdout RMSE: {holdout_rmse_ensemble:.4f}, MAE: {holdout_mae_ensemble:.4f}, R²: {holdout_r2_ensemble:.4f}")



Ensemble Model Performance:
Training RMSE: 6054.1962, MAE: 2060.2014, R²: 0.8672
Validation RMSE: 15094.6589, MAE: 2660.9694, R²: 0.2405
Holdout RMSE: 10868.7632, MAE: 2488.8975, R²: 0.6662


# Ensemble 5

In [112]:
# Define the new boosting models
lightgbm_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('lightgbm', LGBMRegressor(random_state=42,
     n_estimators=500,
    learning_rate=0.05,
    max_depth=6))
])

catboost_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('catboost', CatBoostRegressor(verbose=0, random_state=42,
     n_estimators=500,
    learning_rate=0.05,
    max_depth=6))
])

adaboost_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('adaboost', AdaBoostRegressor(random_state=42,
     n_estimators=500,
    learning_rate=0.05))
])

# Updated Ensemble Model with additional boosting models
ensemble_model = VotingRegressor(
    estimators=[
        ('xgboost', xgboost_pipeline),
        ('random_forest', random_forest_pipeline),
        ('lightgbm', lightgbm_pipeline),
        ('catboost', catboost_pipeline),
        ('adaboost', adaboost_pipeline)
    ],
    weights=[1, 1, 1, 1, 1]  # Equal weight for all models
)

# Train the ensemble model
ensemble_model.fit(X_train, y_train)

# Evaluate the Ensemble model
y_train_pred_ensemble = ensemble_model.predict(X_train)
y_val_pred_ensemble = ensemble_model.predict(X_val)
y_holdout_pred_ensemble = ensemble_model.predict(X_holdout)

train_rmse_ensemble = mean_squared_error(y_train, y_train_pred_ensemble, squared=False)
val_rmse_ensemble = mean_squared_error(y_val, y_val_pred_ensemble, squared=False)
holdout_rmse_ensemble = mean_squared_error(y_holdout, y_holdout_pred_ensemble, squared=False)

train_mae_ensemble = mean_absolute_error(y_train, y_train_pred_ensemble)
val_mae_ensemble = mean_absolute_error(y_val, y_val_pred_ensemble)
holdout_mae_ensemble = mean_absolute_error(y_holdout, y_holdout_pred_ensemble)

train_r2_ensemble = ensemble_model.score(X_train, y_train)
val_r2_ensemble = ensemble_model.score(X_val, y_val)
holdout_r2_ensemble = ensemble_model.score(X_holdout, y_holdout)

print("Ensemble Model Performance:")
print(f"Training RMSE: {train_rmse_ensemble:.4f}, MAE: {train_mae_ensemble:.4f}, R²: {train_r2_ensemble:.4f}")
print(f"Validation RMSE: {val_rmse_ensemble:.4f}, MAE: {val_mae_ensemble:.4f}, R²: {val_r2_ensemble:.4f}")
print(f"Holdout RMSE: {holdout_rmse_ensemble:.4f}, MAE: {holdout_mae_ensemble:.4f}, R²: {holdout_r2_ensemble:.4f}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002012 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 949
[LightGBM] [Info] Number of data points in the train set: 39523, number of used features: 63
[LightGBM] [Info] Start training from score 3927.968702




Ensemble Model Performance:
Training RMSE: 7819.6576, MAE: 3127.9306, R²: 0.7784
Validation RMSE: 13468.9323, MAE: 3552.7555, R²: 0.3953
Holdout RMSE: 11252.5350, MAE: 3433.1455, R²: 0.6422


In [113]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Updated boosting models
xgboost_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('xgboost', XGBRegressor(
        objective='reg:squarederror',
        random_state=42,
        n_estimators=400,       # Reduced to prevent overfitting
        learning_rate=0.03,    # Lower learning rate for better generalization
        max_depth=6,
        reg_alpha=1.5,         # Increased L1 regularization
        reg_lambda=1.5,        # Increased L2 regularization
        subsample=0.8,         # Subsampling to prevent overfitting
        colsample_bytree=0.8,
        min_child_weight=10    # Prevents overfitting small splits
    ))
])

lightgbm_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('lightgbm', LGBMRegressor(
        objective='regression',
        random_state=42,
        n_estimators=400,
        learning_rate=0.03,
        max_depth=6,
        reg_alpha=1.5,
        reg_lambda=1.5,
        num_leaves=25,        # Fewer leaves to improve generalization
        subsample=0.8,
        colsample_bytree=0.8
    ))
])

catboost_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('catboost', CatBoostRegressor(
        verbose=0,
        random_state=42,
        iterations=400,
        learning_rate=0.03,
        depth=6,
        l2_leaf_reg=3.0,     # Increased regularization
        subsample=0.8,
        early_stopping_rounds=50  # Early stopping to prevent overfitting
    ))
])

adaboost_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('adaboost', AdaBoostRegressor(
        random_state=42,
        n_estimators=300,    # Reduced number of estimators
        learning_rate=0.03   # Lower learning rate for smoother fit
    ))
])

random_forest_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('random_forest', RandomForestRegressor(
        random_state=42,
        n_estimators=300,    # Balanced tree count
        max_depth=8,         # Reduced depth for generalization
        min_samples_split=10,
        min_samples_leaf=5,
        max_features='sqrt'
    ))
])

# Updated ensemble model
ensemble_model = VotingRegressor(
    estimators=[
        ('xgboost', xgboost_pipeline),
        ('lightgbm', lightgbm_pipeline),
        ('catboost', catboost_pipeline),
        ('adaboost', adaboost_pipeline),
        ('random_forest', random_forest_pipeline)
    ],
    weights=[1.2, 1.2, 1.0, 0.8, 1.0]  # Emphasize XGBoost and LightGBM
)

# Train the ensemble model
ensemble_model.fit(X_train, y_train)

# Evaluate the ensemble model
y_train_pred_ensemble = ensemble_model.predict(X_train)
y_val_pred_ensemble = ensemble_model.predict(X_val)
y_holdout_pred_ensemble = ensemble_model.predict(X_holdout)

train_rmse_ensemble = mean_squared_error(y_train, y_train_pred_ensemble, squared=False)
val_rmse_ensemble = mean_squared_error(y_val, y_val_pred_ensemble, squared=False)
holdout_rmse_ensemble = mean_squared_error(y_holdout, y_holdout_pred_ensemble, squared=False)

train_mae_ensemble = mean_absolute_error(y_train, y_train_pred_ensemble)
val_mae_ensemble = mean_absolute_error(y_val, y_val_pred_ensemble)
holdout_mae_ensemble = mean_absolute_error(y_holdout, y_holdout_pred_ensemble)

train_r2_ensemble = ensemble_model.score(X_train, y_train)
val_r2_ensemble = ensemble_model.score(X_val, y_val)
holdout_r2_ensemble = ensemble_model.score(X_holdout, y_holdout)

# Print results
print("Updated Ensemble Model Performance:")
print(f"Training RMSE: {train_rmse_ensemble:.4f}, MAE: {train_mae_ensemble:.4f}, R²: {train_r2_ensemble:.4f}")
print(f"Validation RMSE: {val_rmse_ensemble:.4f}, MAE: {val_mae_ensemble:.4f}, R²: {val_r2_ensemble:.4f}")
print(f"Holdout RMSE: {holdout_rmse_ensemble:.4f}, MAE: {holdout_mae_ensemble:.4f}, R²: {holdout_r2_ensemble:.4f}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002410 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 949
[LightGBM] [Info] Number of data points in the train set: 39523, number of used features: 63
[LightGBM] [Info] Start training from score 3927.968702




Updated Ensemble Model Performance:
Training RMSE: 10478.3422, MAE: 3369.8659, R²: 0.6022
Validation RMSE: 12901.2331, MAE: 3619.2754, R²: 0.4452
Holdout RMSE: 12749.2995, MAE: 3586.3509, R²: 0.5406


In [114]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Updated boosting models
xgboost_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('xgboost', XGBRegressor(
        objective='reg:squarederror',
        random_state=42,
        n_estimators=350,       # Reduced to prevent overfitting
        learning_rate=0.02,    # Lower learning rate for better generalization
        max_depth=5,
        reg_alpha=2.0,         # Increased L1 regularization
        reg_lambda=2.0,        # Increased L2 regularization
        subsample=0.85,        # Slightly increased subsampling
        colsample_bytree=0.8,
        min_child_weight=8     # Balanced child weight to reduce overfitting
    ))
])

lightgbm_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('lightgbm', LGBMRegressor(
        objective='regression',
        random_state=42,
        n_estimators=350,
        learning_rate=0.02,
        max_depth=5,
        reg_alpha=2.0,
        reg_lambda=2.0,
        num_leaves=20,        # Fewer leaves for better generalization
        subsample=0.85,
        colsample_bytree=0.8
    ))
])

catboost_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('catboost', CatBoostRegressor(
        verbose=0,
        random_state=42,
        iterations=350,
        learning_rate=0.02,
        depth=5,
        l2_leaf_reg=4.0,     # Increased regularization
        subsample=0.85,
        early_stopping_rounds=50  # Early stopping to prevent overfitting
    ))
])

adaboost_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('adaboost', AdaBoostRegressor(
        random_state=42,
        n_estimators=300,    # Reduced number of estimators
        learning_rate=0.02   # Lower learning rate for smoother fit
    ))
])

random_forest_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('random_forest', RandomForestRegressor(
        random_state=42,
        n_estimators=250,    # Balanced tree count
        max_depth=7,         # Reduced depth for generalization
        min_samples_split=12,
        min_samples_leaf=6,
        max_features='sqrt'
    ))
])

# Updated ensemble model
ensemble_model = VotingRegressor(
    estimators=[
        ('xgboost', xgboost_pipeline),
        ('lightgbm', lightgbm_pipeline),
        ('catboost', catboost_pipeline),
        ('adaboost', adaboost_pipeline),
        ('random_forest', random_forest_pipeline)
    ],
    weights=[1.3, 1.3, 1.0, 0.7, 1.0]  # Emphasize XGBoost and LightGBM
)

# Train the ensemble model
ensemble_model.fit(X_train, y_train)

# Evaluate the ensemble model
y_train_pred_ensemble = ensemble_model.predict(X_train)
y_val_pred_ensemble = ensemble_model.predict(X_val)
y_holdout_pred_ensemble = ensemble_model.predict(X_holdout)

train_rmse_ensemble = mean_squared_error(y_train, y_train_pred_ensemble, squared=False)
val_rmse_ensemble = mean_squared_error(y_val, y_val_pred_ensemble, squared=False)
holdout_rmse_ensemble = mean_squared_error(y_holdout, y_holdout_pred_ensemble, squared=False)

train_mae_ensemble = mean_absolute_error(y_train, y_train_pred_ensemble)
val_mae_ensemble = mean_absolute_error(y_val, y_val_pred_ensemble)
holdout_mae_ensemble = mean_absolute_error(y_holdout, y_holdout_pred_ensemble)

train_r2_ensemble = ensemble_model.score(X_train, y_train)
val_r2_ensemble = ensemble_model.score(X_val, y_val)
holdout_r2_ensemble = ensemble_model.score(X_holdout, y_holdout)

# Print results
print("Updated Ensemble Model Performance:")
print(f"Training RMSE: {train_rmse_ensemble:.4f}, MAE: {train_mae_ensemble:.4f}, R²: {train_r2_ensemble:.4f}")
print(f"Validation RMSE: {val_rmse_ensemble:.4f}, MAE: {val_mae_ensemble:.4f}, R²: {val_r2_ensemble:.4f}")
print(f"Holdout RMSE: {holdout_rmse_ensemble:.4f}, MAE: {holdout_mae_ensemble:.4f}, R²: {holdout_r2_ensemble:.4f}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001724 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 949
[LightGBM] [Info] Number of data points in the train set: 39523, number of used features: 63
[LightGBM] [Info] Start training from score 3927.968702




Updated Ensemble Model Performance:
Training RMSE: 11974.8823, MAE: 3572.9256, R²: 0.4804
Validation RMSE: 13512.8749, MAE: 3781.1948, R²: 0.3913
Holdout RMSE: 14062.7866, MAE: 3773.8472, R²: 0.4411


In [115]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Updated boosting models
xgboost_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('xgboost', XGBRegressor(
        objective='reg:squarederror',
        random_state=42,
        n_estimators=400,       # Slightly increased for better generalization
        learning_rate=0.03,    # Lower learning rate for smoother convergence
        max_depth=6,            # Increased to capture more interactions
        reg_alpha=1.5,          # Balanced regularization
        reg_lambda=1.5,         # Balanced regularization
        subsample=0.85,         # Avoid overfitting
        colsample_bytree=0.9,
        min_child_weight=6      # Adjusted for moderate regularization
    ))
])

lightgbm_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('lightgbm', LGBMRegressor(
        objective='regression',
        random_state=42,
        n_estimators=400,
        learning_rate=0.03,
        max_depth=7,            # Slightly deeper trees
        reg_alpha=1.5,
        reg_lambda=1.5,
        num_leaves=30,          # Increased leaves for richer patterns
        subsample=0.85,
        colsample_bytree=0.9
    ))
])

catboost_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('catboost', CatBoostRegressor(
        verbose=0,
        random_state=42,
        iterations=400,
        learning_rate=0.03,
        depth=6,               # Balanced depth
        l2_leaf_reg=3.5,       # Reduced regularization for improved fit
        subsample=0.85,
        early_stopping_rounds=50  # Early stopping for better generalization
    ))
])

adaboost_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('adaboost', AdaBoostRegressor(
        random_state=42,
        n_estimators=350,       # Increased estimators for better fitting
        learning_rate=0.03      # Lower learning rate for smoother fit
    ))
])

random_forest_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('random_forest', RandomForestRegressor(
        random_state=42,
        n_estimators=300,       # Balanced tree count
        max_depth=8,            # Increased depth for richer splits
        min_samples_split=10,
        min_samples_leaf=4,     # Reduced leaf size to capture more detail
        max_features='sqrt'
    ))
])

# Updated ensemble model
ensemble_model = VotingRegressor(
    estimators=[
        ('xgboost', xgboost_pipeline),
        ('lightgbm', lightgbm_pipeline),
        ('catboost', catboost_pipeline),
        ('adaboost', adaboost_pipeline),
        ('random_forest', random_forest_pipeline)
    ],
    weights=[1.5, 1.5, 1.0, 0.8, 1.0]  # Increased emphasis on XGBoost and LightGBM
)

# Train the ensemble model
ensemble_model.fit(X_train, y_train)

# Evaluate the ensemble model
y_train_pred_ensemble = ensemble_model.predict(X_train)
y_val_pred_ensemble = ensemble_model.predict(X_val)
y_holdout_pred_ensemble = ensemble_model.predict(X_holdout)

train_rmse_ensemble = mean_squared_error(y_train, y_train_pred_ensemble, squared=False)
val_rmse_ensemble = mean_squared_error(y_val, y_val_pred_ensemble, squared=False)
holdout_rmse_ensemble = mean_squared_error(y_holdout, y_holdout_pred_ensemble, squared=False)

train_mae_ensemble = mean_absolute_error(y_train, y_train_pred_ensemble)
val_mae_ensemble = mean_absolute_error(y_val, y_val_pred_ensemble)
holdout_mae_ensemble = mean_absolute_error(y_holdout, y_holdout_pred_ensemble)

train_r2_ensemble = ensemble_model.score(X_train, y_train)
val_r2_ensemble = ensemble_model.score(X_val, y_val)
holdout_r2_ensemble = ensemble_model.score(X_holdout, y_holdout)

# Print results
print("Updated Ensemble Model Performance:")
print(f"Training RMSE: {train_rmse_ensemble:.4f}, MAE: {train_mae_ensemble:.4f}, R²: {train_r2_ensemble:.4f}")
print(f"Validation RMSE: {val_rmse_ensemble:.4f}, MAE: {val_mae_ensemble:.4f}, R²: {val_r2_ensemble:.4f}")
print(f"Holdout RMSE: {holdout_rmse_ensemble:.4f}, MAE: {holdout_mae_ensemble:.4f}, R²: {holdout_r2_ensemble:.4f}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002315 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 949
[LightGBM] [Info] Number of data points in the train set: 39523, number of used features: 63
[LightGBM] [Info] Start training from score 3927.968702




Updated Ensemble Model Performance:
Training RMSE: 9847.6171, MAE: 3192.9381, R²: 0.6486
Validation RMSE: 12705.4757, MAE: 3472.8355, R²: 0.4619
Holdout RMSE: 12253.9294, MAE: 3416.7250, R²: 0.5756


In [116]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Updated boosting models
xgboost_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('xgboost', XGBRegressor(
        objective='reg:squarederror',
        random_state=42,
        n_estimators=350,       # Adjusted for smoother convergence
        learning_rate=0.025,    # Lower learning rate
        max_depth=6,            # Balanced depth for validation
        reg_alpha=1.8,          # Adjusted L1 regularization
        reg_lambda=2.2,         # Adjusted L2 regularization
        subsample=0.85,         # Controlled subsampling
        colsample_bytree=0.9,
        min_child_weight=7      # Adjusted for generalization
    ))
])

lightgbm_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('lightgbm', LGBMRegressor(
        objective='regression',
        random_state=42,
        n_estimators=350,
        learning_rate=0.025,
        max_depth=7,            # Slightly deeper trees for richer splits
        reg_alpha=1.8,
        reg_lambda=2.2,
        num_leaves=25,          # Balanced leaf count
        subsample=0.85,
        colsample_bytree=0.9
    ))
])

catboost_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('catboost', CatBoostRegressor(
        verbose=0,
        random_state=42,
        iterations=350,
        learning_rate=0.025,
        depth=6,               # Balanced depth
        l2_leaf_reg=4.0,       # Regularization for generalization
        subsample=0.85,
        early_stopping_rounds=50
    ))
])

adaboost_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('adaboost', AdaBoostRegressor(
        random_state=42,
        n_estimators=300,      # Balanced estimator count
        learning_rate=0.025    # Lower learning rate for smoother fit
    ))
])

random_forest_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('random_forest', RandomForestRegressor(
        random_state=42,
        n_estimators=250,
        max_depth=8,
        min_samples_split=10,
        min_samples_leaf=5,
        max_features='sqrt'
    ))
])

# Updated ensemble model
ensemble_model = VotingRegressor(
    estimators=[
        ('xgboost', xgboost_pipeline),
        ('lightgbm', lightgbm_pipeline),
        ('catboost', catboost_pipeline),
        ('adaboost', adaboost_pipeline),
        ('random_forest', random_forest_pipeline)
    ],
    weights=[1.6, 1.6, 1.0, 0.7, 1.0]  # Increased emphasis on XGBoost and LightGBM
)

# Train the ensemble model
ensemble_model.fit(X_train, y_train)

# Evaluate the ensemble model
y_train_pred_ensemble = ensemble_model.predict(X_train)
y_val_pred_ensemble = ensemble_model.predict(X_val)
y_holdout_pred_ensemble = ensemble_model.predict(X_holdout)

train_rmse_ensemble = mean_squared_error(y_train, y_train_pred_ensemble, squared=False)
val_rmse_ensemble = mean_squared_error(y_val, y_val_pred_ensemble, squared=False)
holdout_rmse_ensemble = mean_squared_error(y_holdout, y_holdout_pred_ensemble, squared=False)

train_mae_ensemble = mean_absolute_error(y_train, y_train_pred_ensemble)
val_mae_ensemble = mean_absolute_error(y_val, y_val_pred_ensemble)
holdout_mae_ensemble = mean_absolute_error(y_holdout, y_holdout_pred_ensemble)

train_r2_ensemble = ensemble_model.score(X_train, y_train)
val_r2_ensemble = ensemble_model.score(X_val, y_val)
holdout_r2_ensemble = ensemble_model.score(X_holdout, y_holdout)

# Print results
print("Final Updated Ensemble Model Performance:")
print(f"Training RMSE: {train_rmse_ensemble:.4f}, MAE: {train_mae_ensemble:.4f}, R²: {train_r2_ensemble:.4f}")
print(f"Validation RMSE: {val_rmse_ensemble:.4f}, MAE: {val_mae_ensemble:.4f}, R²: {val_r2_ensemble:.4f}")
print(f"Holdout RMSE: {holdout_rmse_ensemble:.4f}, MAE: {holdout_mae_ensemble:.4f}, R²: {holdout_r2_ensemble:.4f}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001714 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 949
[LightGBM] [Info] Number of data points in the train set: 39523, number of used features: 63
[LightGBM] [Info] Start training from score 3927.968702




Final Updated Ensemble Model Performance:
Training RMSE: 10231.5396, MAE: 3202.6527, R²: 0.6207
Validation RMSE: 12620.7782, MAE: 3451.5310, R²: 0.4690
Holdout RMSE: 12437.9855, MAE: 3403.3793, R²: 0.5628


In [117]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Updated boosting models
xgboost_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('xgboost', XGBRegressor(
        objective='reg:squarederror',
        random_state=42,
        n_estimators=350,       # Adjusted for smoother convergence
        learning_rate=0.025,    # Lower learning rate
        max_depth=6,            # Balanced depth for validation
        reg_alpha=1.8,          # Adjusted L1 regularization
        reg_lambda=2.2,         # Adjusted L2 regularization
        subsample=0.85,         # Controlled subsampling
        colsample_bytree=0.9,
        min_child_weight=7      # Adjusted for generalization
    ))
])

lightgbm_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('lightgbm', LGBMRegressor(
        objective='regression',
        random_state=42,
        n_estimators=350,
        learning_rate=0.025,
        max_depth=7,            # Slightly deeper trees for richer splits
        reg_alpha=1.8,
        reg_lambda=2.2,
        num_leaves=25,          # Balanced leaf count
        subsample=0.85,
        colsample_bytree=0.9
    ))
])

catboost_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('catboost', CatBoostRegressor(
        verbose=0,
        random_state=42,
        iterations=350,
        learning_rate=0.025,
        depth=6,               # Balanced depth
        l2_leaf_reg=4.0,       # Regularization for generalization
        subsample=0.85,
        early_stopping_rounds=50
    ))
])

adaboost_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('adaboost', AdaBoostRegressor(
        random_state=42,
        n_estimators=300,      # Balanced estimator count
        learning_rate=0.025    # Lower learning rate for smoother fit
    ))
])

random_forest_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('random_forest', RandomForestRegressor(
        random_state=42,
        n_estimators=250,
        max_depth=8,
        min_samples_split=10,
        min_samples_leaf=5,
        max_features='sqrt'
    ))
])

# Updated ensemble model
ensemble_model = VotingRegressor(
    estimators=[
        ('xgboost', xgboost_pipeline),
        ('lightgbm', lightgbm_pipeline),
        ('catboost', catboost_pipeline),
        ('adaboost', adaboost_pipeline),
        ('random_forest', random_forest_pipeline)
    ],
    weights=[1.6, 1.6, 1.0, 0.7, 0.5]  # Increased emphasis on XGBoost and LightGBM
)

# Train the ensemble model
ensemble_model.fit(X_train, y_train)

# Evaluate the ensemble model
y_train_pred_ensemble = ensemble_model.predict(X_train)
y_val_pred_ensemble = ensemble_model.predict(X_val)
y_holdout_pred_ensemble = ensemble_model.predict(X_holdout)

train_rmse_ensemble = mean_squared_error(y_train, y_train_pred_ensemble, squared=False)
val_rmse_ensemble = mean_squared_error(y_val, y_val_pred_ensemble, squared=False)
holdout_rmse_ensemble = mean_squared_error(y_holdout, y_holdout_pred_ensemble, squared=False)

train_mae_ensemble = mean_absolute_error(y_train, y_train_pred_ensemble)
val_mae_ensemble = mean_absolute_error(y_val, y_val_pred_ensemble)
holdout_mae_ensemble = mean_absolute_error(y_holdout, y_holdout_pred_ensemble)

train_r2_ensemble = ensemble_model.score(X_train, y_train)
val_r2_ensemble = ensemble_model.score(X_val, y_val)
holdout_r2_ensemble = ensemble_model.score(X_holdout, y_holdout)

# Print results
print("Final Updated Ensemble Model Performance:")
print(f"Training RMSE: {train_rmse_ensemble:.4f}, MAE: {train_mae_ensemble:.4f}, R²: {train_r2_ensemble:.4f}")
print(f"Validation RMSE: {val_rmse_ensemble:.4f}, MAE: {val_mae_ensemble:.4f}, R²: {val_r2_ensemble:.4f}")
print(f"Holdout RMSE: {holdout_rmse_ensemble:.4f}, MAE: {holdout_mae_ensemble:.4f}, R²: {holdout_r2_ensemble:.4f}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001693 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 949
[LightGBM] [Info] Number of data points in the train set: 39523, number of used features: 63
[LightGBM] [Info] Start training from score 3927.968702




Final Updated Ensemble Model Performance:
Training RMSE: 9926.2790, MAE: 3174.8362, R²: 0.6430
Validation RMSE: 12553.8697, MAE: 3431.1843, R²: 0.4746
Holdout RMSE: 12174.5629, MAE: 3374.0516, R²: 0.5811


In [118]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Updated boosting models
xgboost_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('xgboost', XGBRegressor(
        objective='reg:squarederror',
        random_state=42,
        n_estimators=350,       # Adjusted for smoother convergence
        learning_rate=0.025,    # Lower learning rate
        max_depth=6,            # Balanced depth for validation
        reg_alpha=1.8,          # Adjusted L1 regularization
        reg_lambda=2.2,         # Adjusted L2 regularization
        subsample=0.85,         # Controlled subsampling
        colsample_bytree=0.9,
        min_child_weight=7      # Adjusted for generalization
    ))
])

lightgbm_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('lightgbm', LGBMRegressor(
        objective='regression',
        random_state=42,
        n_estimators=350,
        learning_rate=0.025,
        max_depth=7,            # Slightly deeper trees for richer splits
        reg_alpha=1.8,
        reg_lambda=2.2,
        num_leaves=25,          # Balanced leaf count
        subsample=0.85,
        colsample_bytree=0.9
    ))
])

catboost_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('catboost', CatBoostRegressor(
        verbose=0,
        random_state=42,
        iterations=350,
        learning_rate=0.025,
        depth=6,               # Balanced depth
        l2_leaf_reg=4.0,       # Regularization for generalization
        subsample=0.85,
        early_stopping_rounds=50
    ))
])

adaboost_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('adaboost', AdaBoostRegressor(
        random_state=42,
        n_estimators=300,      # Balanced estimator count
        learning_rate=0.025    # Lower learning rate for smoother fit
    ))
])

random_forest_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('random_forest', RandomForestRegressor(
        random_state=42,
        n_estimators=250,
        max_depth=8,
        min_samples_split=10,
        min_samples_leaf=5,
        max_features='sqrt'
    ))
])

# Updated ensemble model
ensemble_model = VotingRegressor(
    estimators=[
        ('xgboost', xgboost_pipeline),
        ('lightgbm', lightgbm_pipeline),
        ('catboost', catboost_pipeline),
        ('adaboost', adaboost_pipeline),
        ('random_forest', random_forest_pipeline)
    ],
    weights=[1.6, 1.6, 0.5, 0.4, 0.5]  # Increased emphasis on XGBoost and LightGBM
)

# Train the ensemble model
ensemble_model.fit(X_train, y_train)

# Evaluate the ensemble model
y_train_pred_ensemble = ensemble_model.predict(X_train)
y_val_pred_ensemble = ensemble_model.predict(X_val)
y_holdout_pred_ensemble = ensemble_model.predict(X_holdout)

train_rmse_ensemble = mean_squared_error(y_train, y_train_pred_ensemble, squared=False)
val_rmse_ensemble = mean_squared_error(y_val, y_val_pred_ensemble, squared=False)
holdout_rmse_ensemble = mean_squared_error(y_holdout, y_holdout_pred_ensemble, squared=False)

train_mae_ensemble = mean_absolute_error(y_train, y_train_pred_ensemble)
val_mae_ensemble = mean_absolute_error(y_val, y_val_pred_ensemble)
holdout_mae_ensemble = mean_absolute_error(y_holdout, y_holdout_pred_ensemble)

train_r2_ensemble = ensemble_model.score(X_train, y_train)
val_r2_ensemble = ensemble_model.score(X_val, y_val)
holdout_r2_ensemble = ensemble_model.score(X_holdout, y_holdout)

# Print results
print("Final Updated Ensemble Model Performance:")
print(f"Training RMSE: {train_rmse_ensemble:.4f}, MAE: {train_mae_ensemble:.4f}, R²: {train_r2_ensemble:.4f}")
print(f"Validation RMSE: {val_rmse_ensemble:.4f}, MAE: {val_mae_ensemble:.4f}, R²: {val_r2_ensemble:.4f}")
print(f"Holdout RMSE: {holdout_rmse_ensemble:.4f}, MAE: {holdout_mae_ensemble:.4f}, R²: {holdout_r2_ensemble:.4f}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001721 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 949
[LightGBM] [Info] Number of data points in the train set: 39523, number of used features: 63
[LightGBM] [Info] Start training from score 3927.968702




Final Updated Ensemble Model Performance:
Training RMSE: 9745.4207, MAE: 2968.7537, R²: 0.6559
Validation RMSE: 12279.3420, MAE: 3231.5905, R²: 0.4974
Holdout RMSE: 11894.8747, MAE: 3166.0593, R²: 0.6001


In [122]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Updated boosting models
xgboost_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('xgboost', XGBRegressor(
        objective='reg:squarederror',
        random_state=4242,
        n_estimators=350,       # Adjusted for smoother convergence
        learning_rate=0.025,    # Lower learning rate
        max_depth=6,            # Balanced depth for validation
        reg_alpha=1.8,          # Adjusted L1 regularization
        reg_lambda=2.2,         # Adjusted L2 regularization
        subsample=0.85,         # Controlled subsampling
        colsample_bytree=0.9,
        min_child_weight=7      # Adjusted for generalization
    ))
])

lightgbm_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('lightgbm', LGBMRegressor(
        objective='regression',
        random_state=4242,
        n_estimators=350,
        learning_rate=0.025,
        max_depth=7,            # Slightly deeper trees for richer splits
        reg_alpha=1.8,
        reg_lambda=2.2,
        num_leaves=25,          # Balanced leaf count
        subsample=0.85,
        colsample_bytree=0.9
    ))
])

catboost_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('catboost', CatBoostRegressor(
        verbose=0,
        random_state=4242,
        iterations=350,
        learning_rate=0.025,
        depth=6,               # Balanced depth
        l2_leaf_reg=4.0,       # Regularization for generalization
        subsample=0.85,
        early_stopping_rounds=50
    ))
])

adaboost_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('adaboost', AdaBoostRegressor(
        random_state=4242,
        n_estimators=300,      # Balanced estimator count
        learning_rate=0.025    # Lower learning rate for smoother fit
    ))
])

random_forest_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('random_forest', RandomForestRegressor(
        random_state=4242,
        n_estimators=250,
        max_depth=8,
        min_samples_split=10,
        min_samples_leaf=5,
        max_features='sqrt'
    ))
])

# Updated ensemble model
ensemble_model = VotingRegressor(
    estimators=[
        ('xgboost', xgboost_pipeline),
        ('lightgbm', lightgbm_pipeline),
        ('catboost', catboost_pipeline),
        ('adaboost', adaboost_pipeline),
        ('random_forest', random_forest_pipeline)
    ],
    weights=[1.6, 1.6, 0.1, 0.1, 0.1]  # Increased emphasis on XGBoost and LightGBM
)

# Train the ensemble model
ensemble_model.fit(X_train, y_train)

# Evaluate the ensemble model
y_train_pred_ensemble = ensemble_model.predict(X_train)
y_val_pred_ensemble = ensemble_model.predict(X_val)
y_holdout_pred_ensemble = ensemble_model.predict(X_holdout)

train_rmse_ensemble = mean_squared_error(y_train, y_train_pred_ensemble, squared=False)
val_rmse_ensemble = mean_squared_error(y_val, y_val_pred_ensemble, squared=False)
holdout_rmse_ensemble = mean_squared_error(y_holdout, y_holdout_pred_ensemble, squared=False)

train_mae_ensemble = mean_absolute_error(y_train, y_train_pred_ensemble)
val_mae_ensemble = mean_absolute_error(y_val, y_val_pred_ensemble)
holdout_mae_ensemble = mean_absolute_error(y_holdout, y_holdout_pred_ensemble)

train_r2_ensemble = ensemble_model.score(X_train, y_train)
val_r2_ensemble = ensemble_model.score(X_val, y_val)
holdout_r2_ensemble = ensemble_model.score(X_holdout, y_holdout)

# Print results
print("Final Updated Ensemble Model Performance:")
print(f"Training RMSE: {train_rmse_ensemble:.4f}, MAE: {train_mae_ensemble:.4f}, R²: {train_r2_ensemble:.4f}")
print(f"Validation RMSE: {val_rmse_ensemble:.4f}, MAE: {val_mae_ensemble:.4f}, R²: {val_r2_ensemble:.4f}")
print(f"Holdout RMSE: {holdout_rmse_ensemble:.4f}, MAE: {holdout_mae_ensemble:.4f}, R²: {holdout_r2_ensemble:.4f}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001842 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 949
[LightGBM] [Info] Number of data points in the train set: 39523, number of used features: 63
[LightGBM] [Info] Start training from score 3927.968702




Final Updated Ensemble Model Performance:
Training RMSE: 9160.9071, MAE: 2638.3682, R²: 0.6959
Validation RMSE: 11918.8231, MAE: 2922.3145, R²: 0.5264
Holdout RMSE: 11254.9076, MAE: 2831.1773, R²: 0.6420


In [120]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Updated boosting models
xgboost_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('xgboost', XGBRegressor(
        objective='reg:squarederror',
        random_state=42,
        n_estimators=350,       # Adjusted for smoother convergence
        learning_rate=0.025,    # Lower learning rate
        max_depth=6,            # Balanced depth for validation
        reg_alpha=1.8,          # Adjusted L1 regularization
        reg_lambda=2.2,         # Adjusted L2 regularization
        subsample=0.85,         # Controlled subsampling
        colsample_bytree=0.9,
        min_child_weight=7      # Adjusted for generalization
    ))
])

lightgbm_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('lightgbm', LGBMRegressor(
        objective='regression',
        random_state=42,
        n_estimators=350,
        learning_rate=0.025,
        max_depth=7,            # Slightly deeper trees for richer splits
        reg_alpha=1.8,
        reg_lambda=2.2,
        num_leaves=25,          # Balanced leaf count
        subsample=0.85,
        colsample_bytree=0.9
    ))
])

catboost_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('catboost', CatBoostRegressor(
        verbose=0,
        random_state=42,
        iterations=350,
        learning_rate=0.025,
        depth=6,               # Balanced depth
        l2_leaf_reg=4.0,       # Regularization for generalization
        subsample=0.85,
        early_stopping_rounds=50
    ))
])

adaboost_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('adaboost', AdaBoostRegressor(
        random_state=42,
        n_estimators=300,      # Balanced estimator count
        learning_rate=0.025    # Lower learning rate for smoother fit
    ))
])

random_forest_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('random_forest', RandomForestRegressor(
        random_state=42,
        n_estimators=250,
        max_depth=8,
        min_samples_split=10,
        min_samples_leaf=5,
        max_features='sqrt'
    ))
])

# Updated ensemble model
ensemble_model = VotingRegressor(
    estimators=[
        ('xgboost', xgboost_pipeline),
        ('lightgbm', lightgbm_pipeline),
        ('catboost', catboost_pipeline),
        ('adaboost', adaboost_pipeline),
        ('random_forest', random_forest_pipeline)
    ],
    weights=[2.0, 1.9, 0.7, 0.4, 0.6]  # Increased emphasis on XGBoost and LightGBM
)

# Train the ensemble model
ensemble_model.fit(X_train, y_train)

# Evaluate the ensemble model
y_train_pred_ensemble = ensemble_model.predict(X_train)
y_val_pred_ensemble = ensemble_model.predict(X_val)
y_holdout_pred_ensemble = ensemble_model.predict(X_holdout)

train_rmse_ensemble = mean_squared_error(y_train, y_train_pred_ensemble, squared=False)
val_rmse_ensemble = mean_squared_error(y_val, y_val_pred_ensemble, squared=False)
holdout_rmse_ensemble = mean_squared_error(y_holdout, y_holdout_pred_ensemble, squared=False)

train_mae_ensemble = mean_absolute_error(y_train, y_train_pred_ensemble)
val_mae_ensemble = mean_absolute_error(y_val, y_val_pred_ensemble)
holdout_mae_ensemble = mean_absolute_error(y_holdout, y_holdout_pred_ensemble)

train_r2_ensemble = ensemble_model.score(X_train, y_train)
val_r2_ensemble = ensemble_model.score(X_val, y_val)
holdout_r2_ensemble = ensemble_model.score(X_holdout, y_holdout)

# Print results
print("Final Updated Ensemble Model Performance:")
print(f"Training RMSE: {train_rmse_ensemble:.4f}, MAE: {train_mae_ensemble:.4f}, R²: {train_r2_ensemble:.4f}")
print(f"Validation RMSE: {val_rmse_ensemble:.4f}, MAE: {val_mae_ensemble:.4f}, R²: {val_r2_ensemble:.4f}")
print(f"Holdout RMSE: {holdout_rmse_ensemble:.4f}, MAE: {holdout_mae_ensemble:.4f}, R²: {holdout_r2_ensemble:.4f}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001669 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 949
[LightGBM] [Info] Number of data points in the train set: 39523, number of used features: 63
[LightGBM] [Info] Start training from score 3927.968702




Final Updated Ensemble Model Performance:
Training RMSE: 9684.3373, MAE: 2905.4626, R²: 0.6602
Validation RMSE: 12238.6046, MAE: 3170.6019, R²: 0.5007
Holdout RMSE: 11830.6605, MAE: 3099.1856, R²: 0.6045


# Predictions

In [123]:
# Redefine dummy data for fitting
X_excluded = pred[[ 'Vodka', 'Store_Size_Extra Large',
          'Store_Size_Large', 'Store_Size_Medium', 'Store_Size_Small', 'Store_State_AZ', 'Store_State_CA',
          'Store_State_CO', 'Store_State_CT', 'Store_State_DE', 'Store_State_FL', 'Store_State_GA',
          'Store_State_IL', 'Store_State_IN', 'Store_State_KS', 'Store_State_KY', 'Store_State_LA',
          'Store_State_MA', 'Store_State_MD', 'Store_State_MI', 'Store_State_MN', 'Store_State_MO',
          'Store_State_NJ', 'Store_State_NM', 'Store_State_NV', 'Store_State_NY', 'Store_State_SC',
          'Store_State_TN', 'Store_State_TX', 'Store_State_WA', 'Store_State_WI', 'Package_Type_1.5L',
          'Package_Type_1.75L', 'Package_Type_1.75Lgft', 'Package_Type_100ml', 'Package_Type_1L',
          'Package_Type_200-3gft', 'Package_Type_200ml', 'Package_Type_375ml', 'Package_Type_700ml',
          'Package_Type_720ml', 'Package_Type_750gft', 'Package_Type_750ml', 'Count_Week_Instock_Normalized',
          'log_Retail_Price', 'log_Households', 'log_Store_Age_Days', 'Cluster_Label',
          'Vodka_Tequila_Under_65_Ratio', 'Vodka_Tequila_Over_65_Ratio', 'Vodka_Wine_Ratio',
          'Spirits_Direct', 'Flavored_Vodka', 'Top20_Vodka']]

# Predict with ensemble model
y_excluded_pred = ensemble_model.predict(X_excluded)

# Add predictions to the DataFrame
pred['Predicted_Sales'] = y_excluded_pred

# Save predictions to CSV
output_path = "df_excluded_predictions.csv"
pred[['Store_Number', 'Item_Code', 'Predicted_Sales']].to_csv(output_path, index=False)
output_path

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pred['Predicted_Sales'] = y_excluded_pred


'df_excluded_predictions.csv'

# ML

In [None]:
!pip install tensorflow



In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [None]:
# Define features and target
data = df_cleaned[df_cleaned['Sales Bucket'] == 'Sales included']
X = data[['Vodka', 'Store_Size_Extra Large',
          'Store_Size_Large', 'Store_Size_Medium', 'Store_Size_Small', 'Store_State_AZ', 'Store_State_CA',
          'Store_State_CO', 'Store_State_CT', 'Store_State_DE', 'Store_State_FL', 'Store_State_GA',
          'Store_State_IL', 'Store_State_IN', 'Store_State_KS', 'Store_State_KY', 'Store_State_LA',
          'Store_State_MA', 'Store_State_MD', 'Store_State_MI', 'Store_State_MN', 'Store_State_MO',
          'Store_State_NJ', 'Store_State_NM', 'Store_State_NV', 'Store_State_NY', 'Store_State_SC',
          'Store_State_TN', 'Store_State_TX', 'Store_State_WA', 'Store_State_WI', 'Package_Type_1.5L',
          'Package_Type_1.75L', 'Package_Type_1.75Lgft', 'Package_Type_100ml', 'Package_Type_1L',
          'Package_Type_200-3gft', 'Package_Type_200ml', 'Package_Type_375ml', 'Package_Type_700ml',
          'Package_Type_720ml', 'Package_Type_750gft', 'Package_Type_750ml', 'Count_Week_Instock_Normalized',
          'log_Retail_Price', 'log_Households', 'log_Store_Age_Days', 'Cluster_Label',
          'Vodka_Tequila_Under_65_Ratio', 'Vodka_Tequila_Over_65_Ratio', 'Vodka_Wine_Ratio',
          'Spirits_Direct', 'Flavored_Vodka', 'Top20_Vodka']]
y = data['Normalized_Sales_$L52W']

# Split into train, validation, and holdout sets
X_train_val, X_holdout, y_train_val, y_holdout = train_test_split(X, y, test_size=0.1, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)

# Preprocessing pipeline
numerical_features = ['Vodka', 'Count_Week_Instock_Normalized', 'log_Retail_Price',
                      'log_Households', 'log_Store_Age_Days', 'Vodka_Tequila_Under_65_Ratio',
                      'Vodka_Tequila_Over_65_Ratio', 'Vodka_Wine_Ratio']
categorical_features = ['Store_Size_Extra Large', 'Store_Size_Large', 'Store_Size_Medium',
                        'Store_Size_Small', 'Store_State_AZ', 'Store_State_CA', 'Store_State_CO',
                        'Store_State_CT', 'Store_State_DE', 'Store_State_FL', 'Store_State_GA',
                        'Store_State_IL', 'Store_State_IN', 'Store_State_KS', 'Store_State_KY',
                        'Store_State_LA', 'Store_State_MA', 'Store_State_MD', 'Store_State_MI',
                        'Store_State_MN', 'Store_State_MO', 'Store_State_NJ', 'Store_State_NM',
                        'Store_State_NV', 'Store_State_NY', 'Store_State_SC', 'Store_State_TN',
                        'Store_State_TX', 'Store_State_WA', 'Store_State_WI', 'Package_Type_1.5L',
                        'Package_Type_1.75L', 'Package_Type_1.75Lgft', 'Package_Type_100ml',
                        'Package_Type_1L', 'Package_Type_200-3gft', 'Package_Type_200ml',
                        'Package_Type_375ml', 'Package_Type_700ml', 'Package_Type_720ml',
                        'Package_Type_750gft', 'Package_Type_750ml', 'Cluster_Label',
                        'Spirits_Direct', 'Flavored_Vodka', 'Top20_Vodka']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Preprocess data
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_val_preprocessed = preprocessor.transform(X_val)
X_holdout_preprocessed = preprocessor.transform(X_holdout)

# Define the neural network model
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train_preprocessed.shape[1],)),
    BatchNormalization(),
    Dropout(0.3),
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(1)  # Single output for regression
])

# Compile the model
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Train the model
history = model.fit(
    X_train_preprocessed, y_train,
    epochs=60,  # Increased epochs for better convergence
    batch_size=40,
    validation_data=(X_val_preprocessed, y_val),
    verbose=0
)

# Evaluate the model
y_train_pred_nn = model.predict(X_train_preprocessed).flatten()
y_val_pred_nn = model.predict(X_val_preprocessed).flatten()
y_holdout_pred_nn = model.predict(X_holdout_preprocessed).flatten()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1236/1236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step
[1m309/309[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step


In [None]:
# Metrics
def evaluate_model_nn(y_true, y_pred, set_name):
    rmse = mean_squared_error(y_true, y_pred, squared=False)
    mae = mean_absolute_error(y_true, y_pred)
    print(f"{set_name} RMSE: {rmse:.4f}")
    print(f"{set_name} MAE: {mae:.4f}")

print("Neural Network Model Performance:")
evaluate_model_nn(y_train, y_train_pred_nn, "Training")
evaluate_model_nn(y_val, y_val_pred_nn, "Validation")
evaluate_model_nn(y_holdout, y_holdout_pred_nn, "Holdout")

Neural Network Model Performance:
Training RMSE: 14600.0732
Training MAE: 3019.0694
Validation RMSE: 22907.4637
Validation MAE: 3493.4610
Holdout RMSE: 15644.8552
Holdout MAE: 3141.3507


