In [1]:
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import torch
import utils
from datetime import timedelta, datetime
from chronos import ChronosPipeline
from darts import TimeSeries
from darts.dataprocessing.pipeline import Pipeline
from darts.models import TiDEModel
from darts.dataprocessing.transformers import Scaler
from darts.utils.timeseries_generation import datetime_attribute_timeseries
from darts.utils.likelihood_models import QuantileRegression
from darts.dataprocessing.transformers import StaticCovariatesTransformer, MissingValuesFiller

# --- Column roles ---
TIME_COL = "Date"                                 # timestamp column of the sales data
TARGET = "Weekly_Sales"                           # column being forecast
STATIC_COV = ["Store", "Dept", "Type", "Size"]    # per-series attributes (also the grouping keys for the darts series)

# --- Time-varying covariates, split by the gap-filling strategy their constant names indicate ---
DYNAMIC_COV_FILL_0 = ["IsHoliday"] + [f"MarkDown{i}" for i in range(1, 6)]
DYNAMIC_COV_FILL_INTERPOLATE = ["Temperature", "Fuel_Price", "CPI", "Unemployment"]

# pandas offset alias: weekly observations anchored on Friday
FREQ = "W-FRI"

# --- darts preprocessing objects, chained into a single pipeline ---
SCALER = Scaler()
TRANSFORMER = StaticCovariatesTransformer()
PIPELINE = Pipeline([SCALER, TRANSFORMER])

### Load data

In [2]:
# Read the raw sales, the store attributes and the exogenous features.
# 'IsHoliday' is removed from the features file before joining
# (presumably because the sales file already carries it - verify against the CSVs).
df = pd.read_csv('data/train.csv')
store_info = pd.read_csv('data/stores.csv')
exo_feat = pd.read_csv('data/features.csv').drop(columns='IsHoliday')

# Left-join store attributes (by store) and weekly features (by store and date),
# then build a "<Store>-<Dept>" key that identifies one time series.
df = (
    df.merge(store_info, on=['Store'], how='left')
      .merge(exo_feat, on=['Store', TIME_COL], how='left')
      .assign(unique_id=lambda d: d['Store'].astype(str) + '-' + d['Dept'].astype(str))
)

n_series = df['unique_id'].nunique(dropna=False)
print(f"Distinct number of time series: {n_series}")
df

Distinct number of time series: 3331


Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Type,Size,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,unique_id
0,1,1,2010-02-05,24924.50,False,A,151315,42.31,2.572,,,,,,211.096358,8.106,1-1
1,1,1,2010-02-12,46039.49,True,A,151315,38.51,2.548,,,,,,211.242170,8.106,1-1
2,1,1,2010-02-19,41595.55,False,A,151315,39.93,2.514,,,,,,211.289143,8.106,1-1
3,1,1,2010-02-26,19403.54,False,A,151315,46.63,2.561,,,,,,211.319643,8.106,1-1
4,1,1,2010-03-05,21827.90,False,A,151315,46.50,2.625,,,,,,211.350143,8.106,1-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
421565,45,98,2012-09-28,508.37,False,B,118221,64.88,3.997,4556.61,20.64,1.50,1601.01,3288.25,192.013558,8.684,45-98
421566,45,98,2012-10-05,628.10,False,B,118221,64.89,3.985,5046.74,,18.82,2253.43,2340.01,192.170412,8.667,45-98
421567,45,98,2012-10-12,1061.02,False,B,118221,54.47,4.000,1956.28,,7.89,599.32,3990.54,192.327265,8.667,45-98
421568,45,98,2012-10-19,760.01,False,B,118221,56.47,3.969,2004.02,,3.18,437.73,1537.49,192.330854,8.667,45-98


### Preprocess data

In [3]:
# Parse the date strings into Timestamps so the date-range filters further down work.
df[TIME_COL] = pd.to_datetime(df[TIME_COL])

# Negative weekly sales are floored at 0.
df[TARGET] = df[TARGET].clip(lower=0)

# Missing markdown values are replaced with 0.
markdown_cols = ['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']
df[markdown_cols] = df[markdown_cols].fillna(0)

# Convert the boolean holiday flag into 0/1.
df["IsHoliday"] = df["IsHoliday"] * 1

# Turn the numeric store size into a categorical variable using the quartiles of
# the per-store sizes: below Q1 -> "small", above Q3 -> "large", otherwise "medium".
# The dtype guard keeps the cell re-runnable: once Size holds strings, comparing it
# with the numeric quartiles again would raise a TypeError.
if pd.api.types.is_numeric_dtype(df["Size"]):
    size_q25 = store_info["Size"].quantile(0.25)
    size_q75 = store_info["Size"].quantile(0.75)
    df["Size"] = np.where(df["Size"] < size_q25, "small",
                    np.where(df["Size"] > size_q75, "large",
                    "medium"))

# Reduce running time by forecasting only the 100 store-dept series with the
# highest total sales.
top_100_stores = (
    df.groupby(['unique_id'])
      .agg({TARGET: 'sum'})
      .reset_index()
      .sort_values(by=TARGET, ascending=False)
      .head(100)
)
# .copy() gives the filtered frame its own data, so later column assignments
# do not operate on a slice of the full frame.
df = df[df['unique_id'].isin(top_100_stores['unique_id'])].copy()

print(f"Distinct number of time series: {df['unique_id'].nunique()}")
df.head()

Distinct number of time series: 100


Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Type,Size,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,unique_id
5118,1,38,2010-02-05,115564.35,0,A,medium,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,1-38
5119,1,38,2010-02-12,94136.35,1,A,medium,38.51,2.548,0.0,0.0,0.0,0.0,0.0,211.24217,8.106,1-38
5120,1,38,2010-02-19,98672.59,0,A,medium,39.93,2.514,0.0,0.0,0.0,0.0,0.0,211.289143,8.106,1-38
5121,1,38,2010-02-26,92755.59,0,A,medium,46.63,2.561,0.0,0.0,0.0,0.0,0.0,211.319643,8.106,1-38
5122,1,38,2010-03-05,108282.86,0,A,medium,46.5,2.625,0.0,0.0,0.0,0.0,0.0,211.350143,8.106,1-38


### Split data

In [4]:
# Split boundaries: train on calendar year 2011, test on the first
# FORECAST_HORIZON week(s) starting 2012-01-01.
FORECAST_HORIZON = 1
train_start_date = datetime(2011, 1, 1)
train_end_date = datetime(2011, 12, 31)
test_start_date = datetime(2012, 1, 1)

# Training window is inclusive on both ends; the test window excludes its upper bound.
dates = df[TIME_COL]
train = df[dates.between(train_start_date, train_end_date)]
test = df[(dates >= test_start_date) & (dates < test_start_date + pd.Timedelta(weeks=FORECAST_HORIZON))]

# Build one darts TimeSeries per combination of the static covariate columns.
# Dates missing inside a series are inserted and their values filled with 0.
train_darts = TimeSeries.from_group_dataframe(
    df=train,
    time_col=TIME_COL,
    value_cols=TARGET,
    group_cols=STATIC_COV,
    freq=FREQ,
    fill_missing_dates=True,
    fillna_value=0,
)

# The counts below are distinct dates across all series; an individual series
# may cover fewer weeks than the totals printed here.
for label, part in (("training", train), ("testing", test)):
    part_dates = part[TIME_COL]
    print(f"Weeks for {label}: {part_dates.nunique()} from {min(part_dates).date()} to {max(part_dates).date()}")

Weeks for training: 52 from 2011-01-07 to 2011-12-30
Weeks for testing: 2 from 2012-01-06 to 2012-01-13


### Forecasting

In [5]:
# Load the pretrained Chronos T5 "tiny" checkpoint on CPU in bfloat16.
pipeline_tiny = ChronosPipeline.from_pretrained(
    "amazon/chronos-t5-tiny",
    device_map="cpu",
    torch_dtype=torch.bfloat16,
)

# Chronos builds its forecasts from sampled trajectories, so fix the torch RNG
# to make the numbers below reproducible across runs.
# NOTE(review): assumes utils.chronos_forecast samples through torch's global RNG - confirm.
torch.manual_seed(42)

# Forecast every series and collect one data frame per series.
forecast_tiny_parts = []
for ts in train_darts:
    # Rebuild the "<Store>-<Dept>" key from the first two static covariates
    # (Store and Dept come first in the grouping columns).
    store_id, dept_id = ts.static_covariates_values()[0][:2]
    unique_id = f"{int(store_id)}-{int(dept_id)}"

    lower, mid, upper = utils.chronos_forecast(pipeline_tiny, ts.pd_dataframe().reset_index(), FORECAST_HORIZON)
    forecast_tiny_parts.append(
        utils.convert_forecast_to_pandas([lower, mid, upper], test[test['unique_id'] == unique_id])
    )

# Stack the per-series frames into a single data frame.
forecast_tiny = pd.concat(forecast_tiny_parts)
forecast_tiny

Unnamed: 0,unique_id,Date,forecast_lower,forecast,forecast_upper
5218,1-38,2012-01-06,65684.362500,73669.960938,83792.546875
5219,1-38,2012-01-13,63547.367188,67483.929688,76762.971094
9041,1-90,2012-01-06,75056.496094,85655.601562,98288.263281
9042,1-90,2012-01-13,84792.884375,92434.109375,102725.108594
9327,1-92,2012-01-06,126343.649219,141377.453125,164376.185938
...,...,...,...,...,...
397113,42-92,2012-01-13,101578.699219,105824.144531,112129.895313
403859,43-92,2012-01-06,74507.397656,84763.476562,88262.614844
403860,43-92,2012-01-13,85608.092188,91399.765625,97131.109375
404288,43-95,2012-01-06,56461.707422,66456.320312,74933.421875


### Evaluate forecast with residuals

In [6]:
# Residual = actual sales minus the tiny model's point forecast, one row per
# series and test week. Column names come from the notebook constants so this
# cell stays in sync with the rest of the notebook.
# The subtraction aligns on index labels: forecast_tiny is expected to carry the
# same row labels as `test` (the displayed outputs show matching labels).
residuals = test[["unique_id", TIME_COL]].copy()
residuals["residuals"] = test[TARGET] - forecast_tiny["forecast"]
residuals

Unnamed: 0,unique_id,Date,residuals
5218,1-38,2012-01-06,17100.859063
5219,1-38,2012-01-13,6055.530313
9041,1-90,2012-01-06,5973.558438
9042,1-90,2012-01-13,75.760625
9327,1-92,2012-01-06,7405.376875
...,...,...,...
397113,42-92,2012-01-13,4844.255469
403859,43-92,2012-01-06,5428.093438
403860,43-92,2012-01-13,4853.794375
404288,43-95,2012-01-06,5351.319687


# Load the pretrained Chronos T5 "large" checkpoint on CPU in bfloat16.
pipeline_large = ChronosPipeline.from_pretrained(
    "amazon/chronos-t5-large",
    device_map="cpu",
    torch_dtype=torch.bfloat16,
)

# Chronos builds its forecasts from sampled trajectories, so fix the torch RNG
# to make the results reproducible across runs.
# NOTE(review): assumes utils.chronos_forecast samples through torch's global RNG - confirm.
torch.manual_seed(42)

# Forecast every series and collect one data frame per series.
forecast_large_parts = []
for ts in train_darts:
    # Rebuild the "<Store>-<Dept>" key from the first two static covariates
    # (Store and Dept come first in the grouping columns).
    store_id, dept_id = ts.static_covariates_values()[0][:2]
    unique_id = f"{int(store_id)}-{int(dept_id)}"

    lower, mid, upper = utils.chronos_forecast(pipeline_large, ts.pd_dataframe().reset_index(), FORECAST_HORIZON)
    forecast_large_parts.append(
        utils.convert_forecast_to_pandas([lower, mid, upper], test[test['unique_id'] == unique_id])
    )

# Stack the per-series frames into a single data frame.
forecast_large = pd.concat(forecast_large_parts)

# Rank the series by their total sales in the test window, largest first.
volume_by_series = test.groupby('unique_id')[TARGET].sum().sort_values(ascending=False)
series = volume_by_series.index.tolist()

# Plot actuals against the tiny-model forecast for the two highest-volume series.
for ts in series[:2]:
    actuals = df[df["unique_id"] == ts]
    predicted = forecast_tiny[forecast_tiny["unique_id"] == ts]
    utils.plot_actuals_forecast(actuals, predicted, ts)
    # To inspect the large model instead, pass
    # forecast_large[forecast_large["unique_id"] == ts] as the second argument.
