<h1 style="text-align:center">Build and Evaluate Time Series Models</h1>
<h2 style="text-align:center">Sales Forecasting - ETS</h2>
<p style="text-align:center">Robert Evans</p>
<p style="text-align:center">School of Technology & Engineering, National University</p>
<p style="text-align:center">DDS-8555: Predictive Analysis</p>
<p style="text-align:center">Dr. Mohammad Yavarimanesh</p>
<p style="text-align:center">March 2, 2025</p>

## Load Required Libraries

In [8]:
import pandas as pd
import numpy as np
from statsmodels.tsa.exponential_smoothing.ets import ETSModel
from sklearn.metrics import mean_absolute_error, mean_squared_error
from tqdm import tqdm
import warnings

# Suppress warnings
# NOTE: called without a category or module filter, this silences every
# warning raised in the kernel from here on (including deprecation notices
# from the libraries used below), so such problems will not be reported.
warnings.filterwarnings("ignore")

## Import Data

In [9]:
# Toggle between the Kaggle runtime and a local copy of the data.
kaggle = False

if kaggle:
    path = ""
else:
    path = "Week8_Data_Sales"

# Build the directory prefix separately: an empty `path` must resolve to the
# current working directory, not to the filesystem root ("/train.csv").
data_prefix = f"{path}/" if path else ""

# Parse `date` while reading so both frames carry real datetimes.
train = pd.read_csv(f"{data_prefix}train.csv", parse_dates=['date'])
test = pd.read_csv(f"{data_prefix}test.csv", parse_dates=['date'])

## Prepare the data

In [10]:
# Collapse the raw rows to one record per (store, family, day), summing both
# the sales and the promotion counts.
series_keys = ["store_nbr", "family", "date"]
daily_totals = train.groupby(series_keys).agg({"sales": "sum", "onpromotion": "sum"})
train_grouped = daily_totals.reset_index()

# Coerce `date` to datetime so it can serve as a time-series index later.
train_grouped["date"] = pd.to_datetime(train_grouped["date"])

In [11]:
# Chronological hold-out: rows dated on or before the 80th-percentile date
# form the training window, every later row is kept for evaluation.
row_dates = train_grouped["date"]
split_date = row_dates.quantile(0.8)  # cutoff date at the 80% quantile
train_set = train_grouped.loc[row_dates <= split_date]
test_set = train_grouped.loc[row_dates > split_date]

In [12]:
# Preview the training portion of the temporal split.
train_set

Unnamed: 0,store_nbr,family,date,sales,onpromotion
0,1,AUTOMOTIVE,2013-01-01,0.0,0
1,1,AUTOMOTIVE,2013-01-02,2.0,0
2,1,AUTOMOTIVE,2013-01-03,3.0,0
3,1,AUTOMOTIVE,2013-01-04,3.0,0
4,1,AUTOMOTIVE,2013-01-05,5.0,0
...,...,...,...,...,...
3000547,54,SEAFOOD,2016-09-08,0.0,0
3000548,54,SEAFOOD,2016-09-09,0.0,0
3000549,54,SEAFOOD,2016-09-10,4.0,1
3000550,54,SEAFOOD,2016-09-11,0.0,0


## Build The Model

In [13]:
# Fit one ETS model per store-family series on the training window and score
# it against the hold-out window. Every successfully evaluated series appends
# [store, family, mae, rmse] to `results`.
results = []
group_keys = ["store_nbr", "family"]
store_family_groups = train_set.groupby(group_keys)

# Split the hold-out frame once up front instead of re-scanning the whole
# frame with a boolean mask for every store-family series.
holdout_by_group = {key: grp for key, grp in test_set.groupby(group_keys)}

# Letting tqdm drive the iteration advances the bar exactly once per series,
# including the ones that are skipped with `continue`.
with tqdm(store_family_groups, total=len(store_family_groups), desc="Training ETS Models") as pbar:
    for (store, family), df in pbar:
        df = df.set_index("date").sort_index()

        # Ensure the time series has a regular daily frequency: missing dates
        # are inserted and forward-filled, anything still missing becomes 0.
        # `.ffill()` replaces the deprecated `fillna(method="ffill")`.
        df = df.asfreq("D").ffill().fillna(0)

        test_df = holdout_by_group.get((store, family))
        if test_df is None or test_df.empty:
            print(f"Skipping {store}, {family} - No test data available.")
            continue

        try:
            # Initial model attempt: additive error, additive trend, no seasonality
            model = ETSModel(df["sales"], error="add", trend="add", seasonal=None)
            fit = model.fit(maxiter=1000, disp=False)

        except Exception as first_err:
            print(f"Retrying {store}, {family} with damped trend due to: {first_err}")
            try:
                # Retry with damped trend
                model = ETSModel(df["sales"], error="add", trend="add", damped_trend=True, seasonal=None)
                fit = model.fit(maxiter=1000, disp=False)
            except Exception as second_err:
                print(f"Skipping {store}, {family} after second failure: {second_err}")
                continue

        # Regularise the hold-out series the same way as the training series
        test_df = test_df.set_index("date").sort_index()
        test_df = test_df.asfreq("D").ffill().fillna(0)

        # One forecast step per hold-out day; compared positionally below
        preds = fit.forecast(steps=len(test_df))

        # Ensure actual values are not empty
        actual = test_df["sales"].values
        if len(actual) == 0 or len(preds) == 0:
            print(f"Skipping {store}, {family} - Empty prediction or actual values.")
            continue

        # Evaluate model
        mae = mean_absolute_error(actual, preds)
        rmse = np.sqrt(mean_squared_error(actual, preds))

        results.append([store, family, mae, rmse])

Training ETS Models: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1782/1782 [03:38<00:00,  8.16it/s]


In [14]:
# Tabulate the per-series hold-out scores collected during training.
metric_columns = ["store_nbr", "family", "MAE", "RMSE"]
metrics_df = pd.DataFrame(data=results, columns=metric_columns)

In [15]:
# Preview the per-series MAE / RMSE table.
metrics_df

Unnamed: 0,store_nbr,family,MAE,RMSE
0,1,AUTOMOTIVE,2.336223,2.858969
1,1,BABY CARE,0.000000,0.000000
2,1,BEAUTY,1.964754,2.361285
3,1,BEVERAGES,467.857558,615.965568
4,1,BOOKS,0.649852,1.306228
...,...,...,...,...
1777,54,POULTRY,18.211490,25.656170
1778,54,PREPARED FOODS,23.476708,28.675250
1779,54,PRODUCE,160.131182,196.556477
1780,54,SCHOOL AND OFFICE SUPPLIES,1.816463,5.860568


## Kaggle Submission

In [16]:
# Order the submission frame by series and date, and make sure `date` holds
# datetimes before it is used as an index for prediction.
test = (
    test
    .sort_values(by=["store_nbr", "family", "date"])
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
)

In [17]:
# Forecast sales for every store-family series in the submission frame.
# Each successful series appends an (id, sales) frame to `predictions`.
predictions = []
group_keys = ["store_nbr", "family"]
test_groups = test.groupby(group_keys)

# Train on the FULL aggregated history (`train_grouped`), not on the 80%
# evaluation split: the forecast horizon starts the day after the last
# training observation, so fitting on `train_set` would produce forecasts for
# the days right after the split date rather than for the submission period.
# Grouping once also avoids re-filtering the whole frame for every series.
history_by_group = {key: grp for key, grp in train_grouped.groupby(group_keys)}

# Letting tqdm drive the iteration advances the bar exactly once per series,
# including the ones that are skipped with `continue`.
with tqdm(test_groups, total=len(test_groups), desc="Predicting Sales") as pbar:
    for (store, family), df in pbar:
        df = df.set_index("date").sort_index()
        # Ensure daily frequency in test data; `.ffill()` replaces the
        # deprecated `fillna(method="ffill")`.
        df = df.asfreq("D").ffill().fillna(0)

        try:
            train_series = history_by_group.get((store, family))
            if train_series is None or train_series.empty:
                print(f"Skipping {store}, {family} - No training data available.")
                continue

            # Checked before fitting so no model is trained for nothing
            if len(df) == 0:
                print(f"Skipping {store}, {family} - No test data for forecasting.")
                continue

            train_series = train_series.set_index("date").sort_index()
            train_series = train_series.asfreq("D").ffill().fillna(0)  # Ensure a regular time series

            try:
                model = ETSModel(train_series["sales"], error="add", trend="add", seasonal=None)
                fit = model.fit(maxiter=1000, disp=False)
            except Exception as fit_err:
                # Same fallback as the evaluation loop: retry with a damped
                # trend before giving up on the series.
                print(f"Retrying {store}, {family} with damped trend due to: {fit_err}")
                model = ETSModel(train_series["sales"], error="add", trend="add", damped_trend=True, seasonal=None)
                fit = model.fit(maxiter=1000, disp=False)

            # One forecast step per submission day, assigned positionally
            preds = fit.forecast(len(df))
            df["sales"] = preds.values
            df = df.reset_index()

            predictions.append(df[["id", "sales"]])

        except Exception as e:
            print(f"Skipping {store}, {family} due to error: {e}")

Predicting Sales: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1782/1782 [07:36<00:00,  3.91it/s]


In [18]:
# Write the combined forecasts to disk, or report that none were produced.
if not predictions:
    print("No predictions were generated.")
else:
    predictions_df = pd.concat(predictions)
    predictions_df.to_csv("sales_predictions_ets.csv", index=False)

In [24]:
# Persist the per-series evaluation metrics to CSV (index column omitted).
metrics_df.to_csv("sales_predictions_ets_metrics.csv", index=False)