In [1]:
import pandas as pd
import numpy as np

np.random.seed(0)

# Load the store sales dataset
excel_file = pd.ExcelFile('data/store_sales.xlsx')
sheet_names = excel_file.sheet_names
features = ['F', 'D', 'Unit.Price']
target = 'Total.Volume'

sheet_names, len(sheet_names), features, target

(['StoreWeekSalescarbbev_modify',
  'StoreWeekSalescigets',
  'StoreWeekSalescoffee',
  'StoreWeekSalescoldcer',
  'StoreWeekSalesdeod',
  'StoreWeekSalesdiapers',
  'StoreWeekSalesfactiss',
  'StoreWeekSalesfzdinent',
  'StoreWeekSalesfzpizza',
  'StoreWeekSaleshotdog',
  'StoreWeekSaleslaundet',
  'StoreWeekSalesmargbutr',
  'StoreWeekSalesmayo',
  'StoreWeekSalesmustketc',
  'StoreWeekSalespaptowl',
  'StoreWeekSalespeanbutr',
  'StoreWeekSalesshamp',
  'StoreWeekSalessoup',
  'StoreWeekSalesspagsauc',
  'StoreWeekSalessugarsub',
  'StoreWeekSalestoitisu',
  'StoreWeekSalestoothpa',
  'StoreWeekSalesyogurt',
  'StoreWeekSalesbeer_modify'],
 24,
 ['F', 'D', 'Unit.Price'],
 'Total.Volume')

In [2]:
import pandas as pd
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
import time


def evaluate_model(model, X, y, n=1, aggregate=True):
    data = []

    for random_state in range(n):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=random_state)

        start_time = time.time()
        model.fit(X_train, y_train)
        end_time = time.time()
        train_time = end_time - start_time

        start_time = time.time()
        predictions = model.predict(X_test)
        end_time = time.time()
        comp_time = end_time - start_time

        r2 = r2_score(y_test, predictions)
        mae = mean_absolute_error(y_test, predictions)
        mse = mean_squared_error(y_test, predictions)
        n_iter = model.n_iter_

        data.append([r2, mae, mse, train_time, comp_time, n_iter])

    df = pd.DataFrame(
        data, columns=['r2', 'mae', 'mse', 'train_time', 'comp_time', 'n_iter'])

    return df.mean() if aggregate else df

In [3]:
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import GridSearchCV


def best_estimator(model, param_grid, X, y):
    search = GridSearchCV(model, param_grid)
    search.fit(X, y)
    return search.best_estimator_

In [4]:
from sklearn.experimental import enable_halving_search_cv
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor, ExtraTreesRegressor, HistGradientBoostingRegressor, StackingRegressor, VotingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, BayesianRidge, ARDRegression, SGDRegressor, PassiveAggressiveRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor

from models_sklearn import MLP, FONN1, FONN2, TREENN1, TREENN2


def test_models(X, y):
    # Initialize standard models
    models = {
        # 'Linear Regression': LinearRegression(),
        # 'Ridge Regression': Ridge(),
        # 'Lasso Regression': Lasso(random_state=42),
        # 'ElasticNet Regression': ElasticNet(random_state=42),
        # 'Bayesian Ridge Regression': BayesianRidge(),
        # 'ARD Regression': ARDRegression(),
        # 'SGD Regressor': SGDRegressor(random_state=42),
        # 'Passive Aggressive Regressor': PassiveAggressiveRegressor(random_state=42),
        # 'Support Vector Regression': SVR(),
        # 'MLP Regressor': MLPRegressor(hidden_layer_sizes=(40,), max_iter=2000, learning_rate_init=1e-2, random_state=42, early_stopping=False),
        # 'Random Forest Regressor': RandomForestRegressor(n_estimators=100, random_state=42),
        # 'Gradient Boosting Regressor': GradientBoostingRegressor(random_state=42),
        # 'XGBoost Regressor': XGBRegressor(random_state=42),
        # 'AdaBoost Regressor': AdaBoostRegressor(random_state=42),
        # 'Bagging Regressor': BaggingRegressor(random_state=42),
        # 'ExtraTrees Regressor': ExtraTreesRegressor(random_state=42),
        # 'HistGradientBoosting Regressor': HistGradientBoostingRegressor(random_state=42),
        # 'Stacking Regressor': StackingRegressor(estimators=[
        #     ('lr', LinearRegression()),
        #     ('rf', RandomForestRegressor(n_estimators=10, random_state=42))
        # ], final_estimator=Ridge(random_state=42)),
        # 'Voting Regressor': VotingRegressor(estimators=[
        #     ('lr', LinearRegression()),
        #     ('rf', RandomForestRegressor(n_estimators=10, random_state=42)),
        #     ('gb', GradientBoostingRegressor(random_state=42))
        # ])
    }

    param_grid = {
        'max_iter': [2000],
        'learning_rate': ['constant'],
        'learning_rate_init': [1e-2],
        'tol': [1e-4],
        'early_stopping': [True]
    }

    models['Custom MLP 5'] = best_estimator(MLP(5), param_grid, X, y)
    models['FONN1 5 5'] = best_estimator(FONN1(5, (10,)), param_grid, X, y)
    # models['FONN2 5 5'] = best_estimator(FONN2(5, (10,)), param_grid, X, y)
    # models['TREENN1 5'] = best_estimator(TREENN1((6,)), param_grid, X, y)
    # models['TREENN2 5'] = best_estimator(TREENN2((6,)), param_grid, X, y)
    # models['Custom MLP 10'] = best_estimator(MLP(10), param_grid, X, y)
    # models['FONN1 5 10'] = best_estimator(FONN1(5, (15,)), param_grid, X, y)
    # models['FONN2 5 10'] = best_estimator(FONN2(5, (15,)), param_grid, X, y)
    # models['TREENN1 10'] = best_estimator(TREENN1((11,)), param_grid, X, y)
    # models['TREENN2 10'] = best_estimator(TREENN2((11,)), param_grid, X, y)
    # models['Custom MLP 40'] = best_estimator(MLP(40), param_grid, X, y)
    # models['FONN1 5 40'] = best_estimator(FONN1(5, (45,)), param_grid, X, y)
    # models['FONN2 5 40'] = best_estimator(FONN2(5, (45,)), param_grid, X, y)
    # models['TREENN1 40'] = best_estimator(TREENN1((41,)), param_grid, X, y)
    # models['TREENN2 40'] = best_estimator(TREENN2((41,)), param_grid, X, y)
    # models['Custom MLP 100'] = best_estimator(MLP(100), param_grid, X, y)
    # models['FONN1 5 100'] = best_estimator(FONN1(5, (105,)), param_grid, X, y)
    # models['FONN2 5 100'] = best_estimator(FONN2(5, (105,)), param_grid, X, y)
    # models['TREENN1 100'] = best_estimator(TREENN1((101,)), param_grid, X, y)
    # models['TREENN2 100'] = best_estimator(TREENN2((101,)), param_grid, X, y)

    # Train and evaluate models
    results = []
    for name, model in models.items():
        result = evaluate_model(model, X, y, n=5).values
        results.append([name, *result])

    results_df = pd.DataFrame(
        results, columns=['model', 'r2', 'mae', 'mse', 'train_time', 'comp_time', 'n_iter'])
    results_df.set_index('model', inplace=True)

    return results_df

In [5]:
from sklearn.preprocessing import StandardScaler


def test_sheet(sheet_name):
    df = pd.read_excel(excel_file, sheet_name=sheet_name)
    iri_key_counts = df['IRI_KEY'].value_counts()
    iri_keys = list(map(int, iri_key_counts[iri_key_counts > 300].index))
    print(sheet_name, len(iri_keys))

    sheet_data = []

    for iri_key in iri_keys:
        df_iri = df[df['IRI_KEY'] == iri_key]
        X = df_iri[features]
        y = df_iri[target].values

        scaler_X = StandardScaler()
        X = scaler_X.fit_transform(X)
        scaler_y = StandardScaler()
        y = scaler_y.fit_transform(y.reshape(-1, 1)).ravel()

        results = test_models(X, y)
        results['sheet_name'] = sheet_name
        results['iri_key'] = iri_key
        sheet_data.append(results)

    return sheet_data

In [6]:
from sklearn.utils import shuffle

sheet_names = shuffle(sheet_names, random_state=0, n_samples=3)

dataset_mse = []

for sheet_name in sheet_names:  # type: ignore
    sheet_data = test_sheet(sheet_name)
    sheet_df = pd.DataFrame(sheet_data)
    
    dataset_mse.append(results)

dataset_mse = pd.concat(dataset_mse).T

dataset_mse.insert(0, 'Avg mse', dataset_mse.mean(axis=1))
dataset_mse.to_csv('output/dataset_mse.csv')
dataset_mse

StoreWeekSalesmargbutr 16


ValueError: Must pass 2-d input. shape=(16, 2, 8)

In [None]:
dataset_mse

[[                    r2       mae       mse  train_time  comp_time  n_iter  \
  model                                                                       
  Custom MLP 5  0.465393  0.466968  0.422170    0.017004   0.000073    55.2   
  FONN1 5 5     0.467567  0.455099  0.395207    0.020951   0.000415    49.2   
  
                            sheet_name  iri_key  
  model                                          
  Custom MLP 5  StoreWeekSalesmargbutr  6001821  
  FONN1 5 5     StoreWeekSalesmargbutr  6001821  ,
                      r2       mae       mse  train_time  comp_time  n_iter  \
  model                                                                       
  Custom MLP 5  0.509074  0.459405  0.414229    0.039197   0.000076   127.6   
  FONN1 5 5     0.107566  0.544422  0.832971    0.020536   0.000417    49.8   
  
                            sheet_name  iri_key  
  model                                          
  Custom MLP 5  StoreWeekSalesmargbutr   648368  
  FONN1 5 5

In [None]:
dataset_mse

NameError: name 'dataset_mse' is not defined

In [None]:
# import seaborn as sns
# import matplotlib.pyplot as plt
# import pandas as pd

# # Convert cv_results_ to a DataFrame
# results = pd.DataFrame(search.cv_results_)

# # Pivot the DataFrame to create a matrix for the heatmap
# heatmap_data = results.sort_values('iter').pivot_table(
#             index='param_learning_rate',
#             columns='param_epochs',
#             values='mean_test_score',
#             aggfunc='last',
#         )

# # Plot the heatmap
# plt.figure(figsize=(10, 6))
# sns.heatmap(heatmap_data, annot=True, cmap='viridis')
# plt.title('HalvingGridSearchCV Results')
# plt.xlabel('Number of Estimators')
# plt.ylabel('Learning Rate')
# plt.show()


In [None]:
results