In [81]:
import pandas as pd
import warnings
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from skforecast.ForecasterAutoreg import ForecasterAutoreg
# import matplotlib.pyplot as plt
from skforecast.model_selection import backtesting_forecaster

# Notebook-wide settings: silence every warning category and let pandas
# display all columns of wide frames instead of truncating them.
warnings.simplefilter("ignore")
pd.set_option("display.max_columns", None)

In [3]:
# Load the raw retailer sales extract (path is relative to the notebook).
df = pd.read_csv("retailer_sample.csv")

# Normalise column names to lower snake_case.
# regex=False pins literal matching: the default of `regex` in
# Series.str.replace differs between pandas 1.x and 2.x, and "." is a regex
# wildcard, so relying on the default makes this step version-dependent.
df.columns = (
    df.columns
    .str.lower()
    .str.replace(".", "_", regex=False)
    .str.replace(" ", "_", regex=False)
    .str.replace("__", "_", regex=False)
)

# English month name -> zero-padded month number, used to build a parseable date.
months = {
    "January": "01", "February": "02",
    "March": "03", "April": "04",
    "May": "05", "June": "06",
    "July": "07", "August": "08",
    "September": "09", "October": "10",
    "November": "11", "December": "12",
}
# `replace` leaves unknown month names untouched, so an unexpected value makes
# the strict `to_datetime(format=...)` call below fail loudly.
df["month_num"] = df["month"].replace(months)
df["year"] = df["year"].astype(str)

# Build a "YYYY-MM" string and parse it; each row is stamped with the first
# day of its month.
df["date"] = df["year"] + "-" + df["month_num"]
df["date"] = pd.to_datetime(df["date"], format="%Y-%m")

print(df.shape)
df.head(2)

(256853, 13)


Unnamed: 0,year,month,segment,subsegment,brand,system,province,retailer,sales_units,th_sales_value_usd,th_sales_value_ars,month_num,date
0,2017,January,A,1,BRAND-1,Old,BUENOS AIRES,F,912.645,483.412463,7702.77226,1,2017-01-01
1,2017,February,A,1,BRAND-1,Old,BUENOS AIRES,F,786.42,413.161071,6511.058545,2,2017-02-01


In [50]:
# Total USD sales per retailer and month, indexed by month so that each
# retailer's series can later be sliced with date labels.
tot_retailer_date = (
    df
    .groupby(["retailer", "date"], as_index=False)["th_sales_value_usd"]
    .sum()
    .rename(columns={"th_sales_value_usd": "tot_sales_value_usd"})
    .set_index("date")
)

print(tot_retailer_date.shape)
tot_retailer_date.head()

(100, 2)


Unnamed: 0_level_0,retailer,tot_sales_value_usd
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-01-01,F,95288.165732
2017-02-01,F,58031.171353
2017-03-01,F,50660.782042
2017-04-01,F,44720.596696
2017-05-01,F,56717.546913


In [51]:
# Latest month present in the aggregated data (maximum of the `date` index
# across all retailers).
tot_retailer_date.index.max()

Timestamp('2021-02-01 00:00:00')

In [90]:
def train_eval_model(
    time_series_data: pd.DataFrame,
    model,
    target: str = "tot_sales_value_usd",
    lags: int = 12,
    start_date: str = "2017-01-01",
    end_date: str = "2021-02-01",
    end_train: str = "2020-10-01",
    start_test: str = "2020-11-01",
) -> None:
    """Backtest an autoregressive forecaster separately for every retailer.

    For each distinct value of the ``retailer`` column, the rows of that
    retailer inside ``[start_date, end_date]`` are split by date into a train
    and a test window. A ``ForecasterAutoreg`` wrapping ``model`` (target
    standardised with ``StandardScaler``) is then backtested without refit,
    and the mean absolute percentage error is printed.

    Parameters
    ----------
    time_series_data : pd.DataFrame
        Frame indexed by date, with a ``retailer`` column and the ``target``
        column.
    model
        Regressor handed to ``ForecasterAutoreg`` as ``regressor``.
    target : str
        Name of the column to forecast.
    lags : int
        Number of lagged values used as predictors.
    start_date, end_date : str
        Inclusive date labels bounding the data used for each retailer.
    end_train : str
        Last date label (inclusive) of the training window.
    start_test : str
        First date label (inclusive) of the test window. It is expected to be
        the period immediately following ``end_train``; the backtest itself
        splits by position, using the row counts of the two windows.

    Returns
    -------
    None
        Results are reported with ``print`` only.
    """
    for retailer in time_series_data["retailer"].unique():

        print("="*40)
        print(f"Retailer: {retailer}")

        # Sort chronologically before label-slicing: date-range slices and the
        # lag features both rely on the rows being in time order.
        data = (
            time_series_data
            .loc[time_series_data["retailer"] == retailer]
            .sort_index()
            .loc[start_date:end_date]
        )
        data_train = data.loc[:end_train]
        data_test = data.loc[start_test:]

        print(f"Train dates : {data_train.index.min()} --- {data_train.index.max()}  (n={len(data_train)})")
        print(f"Test dates  : {data_test.index.min()} --- {data_test.index.max()}  (n={len(data_test)})")

        # An autoregressive model needs more training rows than lags, and the
        # backtest needs at least one step to predict.
        if len(data_train) <= lags or data_test.empty:
            print("Skipped: not enough observations to train and test.")
            continue

        # The forecaster only ever sees this positionally indexed series.
        # Previously it was fitted on a date-indexed slice and then backtested
        # on a separately re-indexed copy.
        y = data[target].reset_index(drop=True)

        forecaster = ForecasterAutoreg(
            regressor     = model,
            lags          = lags,
            transformer_y = StandardScaler(),
        )

        # Backtest final model
        # ==============================================================================
        # No explicit forecaster.fit() is needed: with `initial_train_size`
        # set, backtesting_forecaster trains on the first `initial_train_size`
        # observations itself before predicting.
        metric, _ = backtesting_forecaster(
            forecaster         = forecaster,
            y                  = y,
            steps              = len(data_test),
            metric             = "mean_absolute_percentage_error",
            initial_train_size = len(data_train),
            refit              = False,
            n_jobs             = 'auto',
            verbose            = False,
            show_progress      = False
        )
        print(f"Error: {metric}")

In [91]:
# Per-retailer backtest using a Ridge regressor as the underlying model.
train_eval_model(time_series_data=tot_retailer_date, model=Ridge(random_state=123))

Retailer: F
Train dates : 2017-01-01 00:00:00 --- 2020-10-01 00:00:00  (n=46)
Test dates  : 2020-11-01 00:00:00 --- 2021-02-01 00:00:00  (n=4)
Error: 0.2621614974321394
Retailer: O
Train dates : 2017-01-01 00:00:00 --- 2020-10-01 00:00:00  (n=46)
Test dates  : 2020-11-01 00:00:00 --- 2021-02-01 00:00:00  (n=4)
Error: 0.21491564433176663


In [92]:
# Per-retailer backtest using a random forest regressor as the underlying model.
train_eval_model(time_series_data=tot_retailer_date, model=RandomForestRegressor(random_state=123))

Retailer: F
Train dates : 2017-01-01 00:00:00 --- 2020-10-01 00:00:00  (n=46)
Test dates  : 2020-11-01 00:00:00 --- 2021-02-01 00:00:00  (n=4)
Error: 0.21607445793986693
Retailer: O
Train dates : 2017-01-01 00:00:00 --- 2020-10-01 00:00:00  (n=46)
Test dates  : 2020-11-01 00:00:00 --- 2021-02-01 00:00:00  (n=4)
Error: 0.11515398085009805


---

In [85]:
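# Predictors importance
# ==============================================================================
# NOTE: left disabled because `forecaster` is a local variable inside
# train_eval_model and does not exist at notebook scope; the function would
# have to return the fitted forecaster before this line can run.
# forecaster.get_feature_importances().sort_values("importance",ascending=False)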