In [None]:
!pip install darts

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from darts import TimeSeries
from darts.models import Prophet
from darts.metrics import smape

import matplotlib.pyplot as plt

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

This notebook uses the Prophet model. Please note that I have no idea what I'm doing and I'm just fooling around, so if I'm doing anything that seems dumb, please let me know in the comments.

In [None]:
# Load the competition data; the 'date' column is parsed into datetimes on read.
train = pd.read_csv(
    '/kaggle/input/tabular-playground-series-jan-2022/train.csv',
    parse_dates=['date'],
)
test = pd.read_csv(
    '/kaggle/input/tabular-playground-series-jan-2022/test.csv',
    parse_dates=['date'],
)

I'll concatenate the train and test data in order to dissect them into different groups based on country, store and product for time series prediction, then I'll extract the test prediction later and concatenate them for submission.

In [None]:
# Number of rows in each dataset, displayed as the cell output.
train_len, test_len = len(train), len(test)

train_len, test_len

In [None]:
# Plot the test 'date' column against the row index.
test['date'].plot();

In [None]:
# Plot the train 'date' column against the row index.
train['date'].plot();

The test set is one year, so I'll make a validation set that is also one year starting from 2018.

I'll build a prediction pipeline using the validation set, then apply the same pipeline to the test set after training on the entire training set.

In [None]:
# Rows before 2018-01-01 form the dev set; rows from that date onwards form the validation set.
dev = train.loc[train['date'] < "2018-01-01"]
val = train.loc[train['date'] >= "2018-01-01"]

In [None]:
# Overlay the 'date' columns of both splits on one set of axes.
dev['date'].plot(label='Dev set')
val['date'].plot(label='Val set')
plt.legend();

Now I'll group the dev set by country, store and product and fit the model for each group.

In [None]:
# One num_sold line per (country, store, product) group of the dev set.
plt.figure(figsize=(20, 10))
dev.groupby(['country', 'store', 'product'])['num_sold'].plot()
plt.legend();

In [None]:
# Baseline: one default Prophet model per (country, store, product) group,
# fitted on the dev set and scored with SMAPE on the validation set.
dev_groups = dev.groupby(['country', 'store', 'product'])
val_groups = val.groupby(['country', 'store', 'product'])
models = {}

print('Training')
for group, df in dev_groups:
    models[group] = Prophet()
    models[group].fit(
        TimeSeries.from_dataframe(df, time_col='date', value_cols='num_sold')
    )


print('Evaluation')
for group, df in val_groups:
    val_series = TimeSeries.from_dataframe(df, time_col='date', value_cols='num_sold')
    # Forecast as many steps as the validation series has points.
    forecast = models[group].predict(len(val_series))
    print(group, 'SMAPE:', smape(forecast, val_series))

It's working, so let's add holidays.

In [None]:
# Build the holiday table passed to Prophet: one row per (date, country) with
# columns 'ds', 'holiday' and 'country'.
#
# The package is imported under an alias because this cell finishes by binding
# the name `holidays` to a DataFrame (later cells query it under that name).
# Without the alias the same name would mean first a module and then a
# DataFrame, and the package would be unreachable after this cell.
import holidays as holidays_lib
import dateutil.easter as easter

YEARS = [2014, 2015, 2016, 2017, 2018, 2019, 2020]
COUNTRIES = ['Finland', 'Sweden', 'Norway']

holiday_list = []

# Official holidays reported by the `holidays` package.
for day, name in holidays_lib.Finland(years=YEARS, observed=True).items():
    holiday_list.append([day, name, "Finland"])

for day, name in holidays_lib.Norway(years=YEARS, observed=True).items():
    holiday_list.append([day, name, "Norway"])

for day, name in holidays_lib.Sweden(years=YEARS, observed=True).items():
    # Skip entries named exactly 'Söndag' and strip the ', Söndag' suffix
    # from the names of the remaining entries.
    if name != 'Söndag':
        holiday_list.append([day, name.replace(", Söndag", ""), "Sweden"])

# Last week of the year: 24-31 December, each day labelled separately.
for year in YEARS:
    for i, day in enumerate(range(24, 32)):
        for country in COUNTRIES:
            holiday_list.append([pd.to_datetime(f"{year}-12-{day}").date(),
                                 f"Last week of the year (day {i+1})",
                                 country])

# Swedish Rock Concert: (first day, last day, year), all in June.
# NOTE(review): 2018 spans five days (6-10) while every other year spans four;
# confirm this is intended.
for start, end, year in [[4, 7, 2014], [3, 6, 2015], [8, 11, 2016],
                         [7, 10, 2017], [6, 10, 2018], [5, 8, 2019]]:
    for i, day in enumerate(range(start, end + 1)):
        holiday_list.append([pd.to_datetime(f"{year}-6-{day}").date(),
                             f"Swedish Rock Concert (day {i+1})",
                             "Sweden"])

# Last Wednesday of June
for date_str in ['2014-06-25', '2015-06-24', '2016-06-29', '2017-06-28',
                 '2018-06-27', '2019-06-26', '2020-06-24']:
    for country in COUNTRIES:
        holiday_list.append([pd.to_datetime(date_str).date(),
                             "Last Wednesday of June",
                             country])

# First Sunday of November
for date_str in ['2014-11-02', '2015-11-1', '2016-11-6', '2017-11-5',
                 '2018-11-4', '2019-11-3', '2020-11-01']:
    for country in COUNTRIES:
        holiday_list.append([pd.to_datetime(date_str).date(),
                             "First Sunday of November",
                             country])

# Independence Day of Finland (6 December)
for year in YEARS:
    holiday_list.append([pd.to_datetime(f"{year}-12-6").date(),
                         "Independence Day of Finland",
                         'Finland'])

# Easter
for easter_day in [easter.easter(y) for y in YEARS]:
    for country in COUNTRIES:
        holiday_list.append([pd.to_datetime(easter_day).date(),
                             "Easter",
                             country])

# Keep only the first entry per (date, country), so official holidays added
# earlier win over the custom ones. Then sort and convert 'ds' to datetime64.
holidays = (
    pd.DataFrame(holiday_list, columns=['ds', 'holiday', 'country'])
    .drop_duplicates(['ds', 'country'], keep='first')
    .sort_values(['ds', 'country'])
    .assign(ds=lambda frame: pd.to_datetime(frame['ds']))
)

In [None]:
# Same per-group pipeline as before, but each Prophet model also receives the
# holiday rows of its own country.
dev_groups = dev.groupby(['country', 'store', 'product'])
val_groups = val.groupby(['country', 'store', 'product'])
models = {}

print('Training')
for group, df in dev_groups:
    # The country is the first element of the group key.
    country = group[0]
    country_holidays = holidays[holidays['country'] == country].drop(columns='country')
    dev_series = TimeSeries.from_dataframe(df, time_col='date', value_cols='num_sold')
    models[group] = Prophet(holidays=country_holidays)
    models[group].fit(dev_series)


print('Evaluation')
for group, df in val_groups:
    val_series = TimeSeries.from_dataframe(df, time_col='date', value_cols='num_sold')
    forecast = models[group].predict(len(val_series))
    print(group, 'SMAPE:', smape(forecast, val_series))

Now let's put these predictions into a dataframe.

In [None]:
# Attach each group's forecast to its validation rows as a 'pred' column.
val_groups = val.groupby(['country', 'store', 'product'])

# Collect the per-group frames and concatenate once at the end. Growing a
# DataFrame with concat inside the loop copies all accumulated rows each time.
val_frames = []
for group, df in val_groups:
    # One forecast step per validation row. The evaluation cell above already
    # builds a TimeSeries from these same rows, so it is not rebuilt here just
    # to take its length.
    forecast = models[group].predict(len(df))
    # assign() returns a new frame rather than writing into the groupby slice,
    # which avoids SettingWithCopyWarning.
    val_frames.append(df.assign(pred=forecast.values().reshape(-1)))

val_pred = pd.concat(val_frames, axis=0) if val_frames else pd.DataFrame()

In [None]:
# Scatter of actual sales (x) against predicted sales (y) on the validation set.
val_pred.plot.scatter(x='num_sold', y='pred', figsize=(10, 4));

Now let's train on the full training set and predict the test set.

In [None]:
def training(df):
    """Fit one holiday-aware Prophet model per (country, store, product) group.

    Args:
        df: DataFrame with 'country', 'store', 'product', 'date' and
            'num_sold' columns.

    Returns:
        dict mapping each (country, store, product) group key to its fitted
        Prophet model.

    Reads the module-level `holidays` DataFrame to select each country's rows.
    """
    grouped = df.groupby(['country', 'store', 'product'])
    fitted = {}
    print('Training')
    for key, group_df in grouped:
        # The country is the first element of the group key.
        country_holidays = holidays[holidays['country'] == key[0]].drop(columns='country')
        series = TimeSeries.from_dataframe(group_df, time_col='date', value_cols='num_sold')
        model = Prophet(holidays=country_holidays)
        model.fit(series)
        fitted[key] = model
    return fitted

def inference(models, df):
    """Forecast every (country, store, product) group of `df` with its model.

    Args:
        models: dict mapping (country, store, product) keys to fitted models
            exposing `predict(n)`, whose result exposes `values()`.
        df: DataFrame with 'country', 'store' and 'product' columns. Each
            group's rows are assumed to be in chronological order and to
            continue directly from that model's training data, because the
            forecast is attached positionally.

    Returns:
        A new DataFrame holding the rows of `df` (grouped in key order) plus a
        'pred' column. `df` itself is not modified. An empty DataFrame is
        returned when `df` has no groups.

    Raises:
        KeyError: if a group in `df` has no entry in `models`.
    """
    predicted_frames = []
    for key, group_df in df.groupby(['country', 'store', 'product']):
        # One forecast step per row of the group.
        forecast = models[key].predict(len(group_df))
        # assign() builds a new frame instead of writing into the groupby
        # slice, which avoids SettingWithCopyWarning.
        predicted_frames.append(group_df.assign(pred=forecast.values().reshape(-1)))

    # Concatenate once: growing a DataFrame inside the loop is quadratic.
    if not predicted_frames:
        return pd.DataFrame()
    return pd.concat(predicted_frames, axis=0)

In [None]:
# Refit one model per (country, store, product) group, this time on the full training set.
models = training(train)

In [None]:
# Forecast each test group with its fitted model; the result carries a 'pred' column.
test_pred = inference(models, test)

In [None]:
# Preview the first rows of the test predictions.
test_pred.head()

In [None]:
# One predicted-sales line per (country, store, product) group of the test set.
plt.figure(figsize=(20, 10))
test_pred.groupby(['country', 'store', 'product'])['pred'].plot()
plt.legend();

In [None]:
# Keep only the id and the prediction, rename the prediction to the
# 'num_sold' column name, and order the rows by row_id.
submission = (
    test_pred[['row_id', 'pred']]
    .rename(columns={'pred': 'num_sold'})
    .sort_values('row_id')
)
submission.head()

Now submit the results.

In [None]:
# Write the submission to 'submission.csv' without the DataFrame index column.
submission.to_csv('submission.csv', index=False)