In [1]:
import numpy as np
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, BayesianRidge, ARDRegression, SGDRegressor, PassiveAggressiveRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor, ExtraTreesRegressor, HistGradientBoostingRegressor, StackingRegressor, VotingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

from models import MLP, Ensemble, FONN1, FONN2, TREENN1, TREENN2

np.random.seed(0)

In [2]:
# Load the store sales dataset
excel_file = pd.ExcelFile('data/store_sales.xlsx')
sheet_names = excel_file.sheet_names

# Read the data
df = pd.read_excel(excel_file, sheet_name=sheet_names[0])
iri_key_counts = df["IRI_KEY"].value_counts()
iri_keys = iri_key_counts[iri_key_counts > 300].index


features = ["F", "D", "Unit.Price"]
target = "Total.Volume"

df = df[df["IRI_KEY"] == iri_keys[0]]
X = df[features]
y = df[target].values.reshape(-1, 1) # type: ignore

scaler_X = StandardScaler()
X = scaler_X.fit_transform(X)
scaler_y = StandardScaler()
y = scaler_y.fit_transform(y)

sheet_names, iri_keys, X.shape, y.shape

(['StoreWeekSalescarbbev_modify',
  'StoreWeekSalescigets',
  'StoreWeekSalescoffee',
  'StoreWeekSalescoldcer',
  'StoreWeekSalesdeod',
  'StoreWeekSalesdiapers',
  'StoreWeekSalesfactiss',
  'StoreWeekSalesfzdinent',
  'StoreWeekSalesfzpizza',
  'StoreWeekSaleshotdog',
  'StoreWeekSaleslaundet',
  'StoreWeekSalesmargbutr',
  'StoreWeekSalesmayo',
  'StoreWeekSalesmustketc',
  'StoreWeekSalespaptowl',
  'StoreWeekSalespeanbutr',
  'StoreWeekSalesshamp',
  'StoreWeekSalessoup',
  'StoreWeekSalesspagsauc',
  'StoreWeekSalessugarsub',
  'StoreWeekSalestoitisu',
  'StoreWeekSalestoothpa',
  'StoreWeekSalesyogurt',
  'StoreWeekSalesbeer_modify'],
 Index([6001821.0,  648368.0,  279300.0,  400003.0,  270862.0,  231720.0,
         252570.0,  659827.0,  241565.0,  237277.0,  291276.0,  273920.0,
         232633.0,  233246.0,  532639.0,  533864.0],
       dtype='float64', name='IRI_KEY'),
 (313, 3),
 (313, 1))

In [3]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((250, 3), (63, 3), (250, 1), (63, 1))

In [4]:
# Function to train and evaluate a model
def train_evaluate_model(model, X_train, X_test, y_train, y_test):
    start_time = time.time()
    model.fit(X_train, y_train)
    end_time = time.time()
    train_time = end_time - start_time

    start_time = time.time()
    predictions = model.predict(X_test)
    end_time = time.time()
    comp_time = end_time - start_time

    r2 = r2_score(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)

    return r2, mae, mse, train_time, comp_time


# Initialize standard models
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "ElasticNet Regression": ElasticNet(),
    "Bayesian Ridge Regression": BayesianRidge(),
    "ARD Regression": ARDRegression(),
    "SGD Regressor": SGDRegressor(),
    "Passive Aggressive Regressor": PassiveAggressiveRegressor(),
    "Support Vector Regression": SVR(),
    "MLP Regressor": MLPRegressor(hidden_layer_sizes=(100,), max_iter=10000, random_state=42),
    "Random Forest Regressor": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting Regressor": GradientBoostingRegressor(random_state=42),
    "XGBoost Regressor": XGBRegressor(random_state=42),
    "AdaBoost Regressor": AdaBoostRegressor(random_state=42),
    "Bagging Regressor": BaggingRegressor(random_state=42),
    "ExtraTrees Regressor": ExtraTreesRegressor(random_state=42),
    "HistGradientBoosting Regressor": HistGradientBoostingRegressor(random_state=42),
    "Stacking Regressor": StackingRegressor(estimators=[
        ('lr', LinearRegression()),
        ('rf', RandomForestRegressor(n_estimators=10, random_state=42))
    ], final_estimator=Ridge()),
    "Voting Regressor": VotingRegressor(estimators=[
        ('lr', LinearRegression()),
        ('rf', RandomForestRegressor(n_estimators=10, random_state=42)),
        ('gb', GradientBoostingRegressor(random_state=42))
    ])
}


In [5]:
# Initialize custom models
input_dim = X_train.shape[1]
hidden_dim = 10
output_dim = 1
batch_size = 32
learning_rate = 0.0001
epochs = 50000

# models["Custom MLP"] = MLP(input_dim, hidden_dim, output_dim,
#                            batch_size=batch_size, learning_rate=learning_rate, epochs=epochs)
# #    learning_rate=0.01, epochs=1000)

# num_trees_input = 10
# models["FONN1"] = FONN1(input_dim, hidden_dim, output_dim, num_trees_input,
#                         batch_size=batch_size, learning_rate=learning_rate, epochs=epochs)
# # learning_rate=0.01, epochs=1000)
# models["Tree-based Predictions (FONN1)"] = models["FONN1"].trees

# models["TREENN1"] = TREENN1(input_dim, hidden_dim, output_dim,
#                             batch_size=batch_size, learning_rate=learning_rate, epochs=epochs)
# # learning_rate=0.01, epochs=40000)
# models["Tree-based Predictions (TREENN1)"] = models["TREENN1"].trees

# num_trees_hidden = 10
# models["FONN2"] = FONN2(input_dim, hidden_dim, output_dim, num_trees_hidden,
#                         batch_size=batch_size, learning_rate=learning_rate, epochs=epochs)
# # learning_rate=0.01, epochs=epochs)
# models["Tree-based Predictions (FONN2)"] = models["FONN2"].trees

# models["TREENN2"] = TREENN2(input_dim, hidden_dim, output_dim,
#                             batch_size=batch_size, learning_rate=learning_rate, epochs=epochs)
# # learning_rate=0.01, epochs=epochs)
# models["Tree-based Predictions (TREENN2)"] = models["TREENN2"].trees

# models["Ensemble of 10 Trees"] = Ensemble(10)

In [6]:
# Train and evaluate models
results = {}
for name, model in models.items():
    print(name)
    r2, mae, mse, fit_time, comp_time = train_evaluate_model(
        model, X_train, X_test, y_train, y_test)
    results[name] = {"R² Score": r2, "MAE": mae, "MSE": mse,
                     "Train Time (s)": fit_time, "Comp Time (s)": comp_time}

# Convert results to a DataFrame for better visualization
results_df = pd.DataFrame(results).T
results_df

Linear Regression
Ridge Regression
Lasso Regression
ElasticNet Regression
Bayesian Ridge Regression
ARD Regression
SGD Regressor
Passive Aggressive Regressor
Support Vector Regression
MLP Regressor
Random Forest Regressor


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)


Gradient Boosting Regressor
XGBoost Regressor
AdaBoost Regressor
Bagging Regressor
ExtraTrees Regressor


  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)


HistGradientBoosting Regressor
Stacking Regressor
Voting Regressor


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Unnamed: 0,R² Score,MAE,MSE,Train Time (s),Comp Time (s)
Linear Regression,0.321274,0.532588,0.433517,0.007542,0.000182
Ridge Regression,0.321294,0.532445,0.433504,0.001237,0.000109
Lasso Regression,-0.01799,0.639909,0.650211,0.000523,8.7e-05
ElasticNet Regression,-0.01799,0.639909,0.650211,0.000478,6.8e-05
Bayesian Ridge Regression,0.321181,0.531217,0.433576,0.001394,0.00011
ARD Regression,0.311469,0.536107,0.439779,0.001498,0.000107
SGD Regressor,0.318295,0.53389,0.435419,0.001062,0.000194
Passive Aggressive Regressor,-2.884998,1.180568,2.481429,0.000637,0.000103
Support Vector Regression,0.562097,0.42547,0.279698,0.003708,0.000793
MLP Regressor,0.476447,0.473188,0.334404,0.115273,0.00015


In [7]:
# Get and print tree importances
# tree_importances = models["FONN2"].trees.get_tree_importances()

In [8]:
def test_models(X, y):
    scaler_X = StandardScaler()
    X = scaler_X.fit_transform(X)
    scaler_y = StandardScaler()
    y = scaler_y.fit_transform(y)

    # Split the dataset
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)

    # Initialize standard models
    models = {
        "Linear Regression": LinearRegression(),
        "Ridge Regression": Ridge(),
        "Lasso Regression": Lasso(),
        "ElasticNet Regression": ElasticNet(),
        "Bayesian Ridge Regression": BayesianRidge(),
        "ARD Regression": ARDRegression(),
        "SGD Regressor": SGDRegressor(),
        "Passive Aggressive Regressor": PassiveAggressiveRegressor(),
        "Support Vector Regression": SVR(),
        "MLP Regressor": MLPRegressor(hidden_layer_sizes=(100,), max_iter=10000, random_state=42),
        "Random Forest Regressor": RandomForestRegressor(n_estimators=100, random_state=42),
        "Gradient Boosting Regressor": GradientBoostingRegressor(random_state=42),
        "XGBoost Regressor": XGBRegressor(random_state=42),
        "AdaBoost Regressor": AdaBoostRegressor(random_state=42),
        "Bagging Regressor": BaggingRegressor(random_state=42),
        "ExtraTrees Regressor": ExtraTreesRegressor(random_state=42),
        "HistGradientBoosting Regressor": HistGradientBoostingRegressor(random_state=42),
        "Stacking Regressor": StackingRegressor(estimators=[
            ('lr', LinearRegression()),
            ('rf', RandomForestRegressor(n_estimators=10, random_state=42))
        ], final_estimator=Ridge()),
        "Voting Regressor": VotingRegressor(estimators=[
            ('lr', LinearRegression()),
            ('rf', RandomForestRegressor(n_estimators=10, random_state=42)),
            ('gb', GradientBoostingRegressor(random_state=42))
        ])
    }

    # Initialize custom models
    input_dim = X_train.shape[1]
    hidden_dim = 10
    output_dim = 1
    batch_size = 32
    learning_rate = 0.0001
    epochs = 50000

    models["Custom MLP"] = MLP(input_dim, hidden_dim, output_dim,
                               batch_size=batch_size, learning_rate=learning_rate, epochs=epochs)
    #    learning_rate=0.01, epochs=1000)

    num_trees_input = 10
    models["FONN1"] = FONN1(input_dim, hidden_dim, output_dim, num_trees_input,
                            batch_size=batch_size, learning_rate=learning_rate, epochs=epochs)
    # learning_rate=0.01, epochs=1000)
    models["Tree-based Predictions (FONN1)"] = models["FONN1"].trees

    models["TREENN1"] = TREENN1(input_dim, hidden_dim, output_dim,
                                batch_size=batch_size, learning_rate=learning_rate, epochs=epochs)
    # learning_rate=0.01, epochs=40000)
    models["Tree-based Predictions (TREENN1)"] = models["TREENN1"].trees

    num_trees_hidden = 10
    models["FONN2"] = FONN2(input_dim, hidden_dim, output_dim, num_trees_hidden,
                            batch_size=batch_size, learning_rate=learning_rate, epochs=epochs)
    # learning_rate=0.01, epochs=epochs)
    models["Tree-based Predictions (FONN2)"] = models["FONN2"].trees

    models["TREENN2"] = TREENN2(input_dim, hidden_dim, output_dim,
                                batch_size=batch_size, learning_rate=learning_rate, epochs=epochs)
    # learning_rate=0.01, epochs=epochs)
    models["Tree-based Predictions (TREENN2)"] = models["TREENN2"].trees

    models["Ensemble of 10 Trees"] = Ensemble(10)

    # Train and evaluate models
    results = {}
    for name, model in models.items():
        r2, mae, mse, fit_time, comp_time = train_evaluate_model(
            model, X_train, X_test, y_train, y_test)
        results[name] = {"R² Score": r2, "MAE": mae, "MSE": mse,
                         "Train Time (s)": fit_time, "Comp Time (s)": comp_time}

    # Convert results to a DataFrame for better visualization
    results_df = pd.DataFrame(results).T
    return results_df

def test_sheets():
    for sheet_name in sheet_names:
        print(sheet_name)
        df = pd.read_excel(excel_file, sheet_name=sheet_name)
        iri_key_counts = df["IRI_KEY"].value_counts()
        iri_keys = iri_key_counts[iri_key_counts > 300].index

        for iri_key in iri_keys:
            df_iri = df[df["IRI_KEY"] == iri_key]
            X = df_iri[features]
            y = df_iri[target].values.reshape(-1, 1)

            print(sheet_name, iri_key, X.shape, y.shape)

            results = test_models(X, y)
            results.to_csv(f"output/{str(sheet_name)+'_'+str(iri_key)}.csv")

In [9]:
from sklearn.model_selection import HalvingGridSearchCV

def search_params(model, X, y):
    param_grid = {
        'learning_rate': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
        'epochs': [1000, 5000, 10000, 50000]
    }
    search = HalvingGridSearchCV(model, param_grid, random_state=42).fit(X, y)
    return search.best_params_


input_dim = X_train.shape[1]
hidden_dim = 10
output_dim = 1
batch_size = 32
# learning_rate = 0.0001
# epochs = 50000

param_grid = {
    'learning_rate': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
    'epochs': [1000, 5000, 10000, 50000]
}

mlp = MLP(input_dim, hidden_dim, output_dim,
          batch_size=batch_size)
# mlp.__init__.__code__.co_varnames[1:]
# search_params(mlp, X_train, y_train)
search = HalvingGridSearchCV(mlp, param_grid, random_state=42)
search.fit(X_train, y_train)

ImportError: HalvingGridSearchCV is experimental and the API might change without any deprecation cycle. To use it, you need to explicitly import enable_halving_search_cv:
from sklearn.experimental import enable_halving_search_cv