In [None]:
# data processing
import numpy as np 
import pandas as pd

# for extracting holidays
import dateutil.easter as easter

# sklearn baseline models
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

# scale numerical features
from sklearn.preprocessing import MinMaxScaler

# for data visualization
import matplotlib.pyplot as plt
import seaborn as sns
# for visualizing decission tree
from sklearn import tree

# notebook-wide plotting theme
plt.style.use('ggplot')

# fixed seed handed to the tree models so that reruns give the same fit
RANDOM_SEED = 42

## My scores on the public leaderboard

* **linear regression:** 25.46780,
* **decision tree:** 9.56904 (basic data).
* **decision tree:** 9.77822 (added holiday data).
* **decision tree:** 6.45971 (added GDP data).

## 1. Load data

In [None]:
# Read the competition's train and test tables
df_train = pd.read_csv('../input/tabular-playground-series-jan-2022/train.csv')
df_test = pd.read_csv('../input/tabular-playground-series-jan-2022/test.csv')

# Derive calendar features with the vectorised `.dt` accessor instead of a
# row-wise `.apply(lambda ...)`.
# The order of the assignments is kept on purpose: later cells pick the model
# inputs by column position (`iloc[:, 3:]`).
for _df in [df_train, df_test]:
    # item assignment is the supported way to replace an existing column
    _df['date'] = pd.to_datetime(_df['date'])

    _df['year'] = _df['date'].dt.year
    _df['month'] = _df['date'].dt.month
    _df['quarter'] = _df['date'].dt.quarter
    _df['day'] = _df['date'].dt.day
    _df['wd'] = _df['date'].dt.weekday  # Monday=0 ... Sunday=6
    _df['weekend'] = _df['wd'].isin([5, 6]).astype(int)
    _df['day_of_year'] = _df['date'].dt.dayofyear
    # isocalendar() returns the nullable UInt32 dtype; cast it to a plain
    # integer so every feature column is an ordinary NumPy dtype
    _df['week_of_year'] = _df['date'].dt.isocalendar().week.astype(int)
    _df['is_friday'] = (_df['wd'] == 4).astype(int)

In [None]:
def _first_weekday_on_or_after(anchor, weekday):
    """Return the earliest date >= `anchor` that falls on `weekday` (Monday=0)."""
    return anchor + pd.Timedelta(days=(weekday - anchor.weekday()) % 7)


def _last_weekday_on_or_before(anchor, weekday):
    """Return the latest date <= `anchor` that falls on `weekday` (Monday=0)."""
    return anchor - pd.Timedelta(days=(anchor.weekday() - weekday) % 7)


def holiday_features(holiday_df, df):
    """
    Add holiday-related columns to `df` and return it.

    Adapted from:
    https://www.kaggle.com/maxencefzr/tps-jan22-catboost-using-pycaret

    Columns added (in this order):

    * ``holiday`` - 1 when the row's date is listed in `holiday_df` for the
      row's own country (Finland, Sweden or Norway), otherwise 0.
    * ``days_from_easter`` - days since Easter Sunday of the row's year,
      clipped to [-5, 65].
    * ``days_from_wed_jun`` - days since the last Wednesday of June,
      clipped to [-5, 5].
    * ``days_from_sun_nov`` - days since the first Sunday of November,
      clipped to [-1, 9].

    The reference dates are computed from the calendar, so they are defined
    for every year present in `df` (no per-year lookup table is needed).

    Parameters
    ----------
    holiday_df : pandas.DataFrame
        Needs a ``country`` column and a datetime ``date`` column.
    df : pandas.DataFrame
        Needs a ``country`` column and a datetime ``date`` column.
        It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, with the new columns attached.
    """
    wednesday, sunday = 2, 6  # datetime weekday numbers (Monday=0)

    # A date counts as a holiday only for the country it is listed under.
    is_holiday = np.zeros(len(df), dtype=bool)
    for country in ('Finland', 'Sweden', 'Norway'):
        country_dates = holiday_df.loc[holiday_df.country == country, 'date']
        is_holiday |= ((df.country == country) & df.date.isin(country_dates)).to_numpy()
    df['holiday'] = is_holiday.astype(int)

    years = df.date.dt.year
    # plain Python ints: one reference date is computed per distinct year,
    # not per row
    known_years = [int(year) for year in years.dropna().unique()]

    def days_from(reference_for_year):
        """Signed number of days from each row's yearly reference date."""
        lookup = {year: reference_for_year(year) for year in known_years}
        # to_datetime keeps the dtype datetime-like even when `df` is empty
        reference = pd.to_datetime(years.map(lookup))
        return (df.date - reference).dt.days

    # Easter
    df['days_from_easter'] = days_from(
        lambda year: pd.Timestamp(easter.easter(year))
    ).clip(-5, 65)

    # NOTE: an offset from the last Sunday of May (Mother's Day) used to be
    # prepared here but was never attached to the frame, so it is left out.

    # Last Wednesday of June
    df['days_from_wed_jun'] = days_from(
        lambda year: _last_weekday_on_or_before(
            pd.Timestamp(year=year, month=6, day=30), wednesday)
    ).clip(-5, 5)

    # First Sunday of November (second Sunday is Father's Day)
    df['days_from_sun_nov'] = days_from(
        lambda year: _first_weekday_on_or_after(
            pd.Timestamp(year=year, month=11, day=1), sunday)
    ).clip(-1, 9)

    return df

In [None]:
# external table of public holidays (only the three columns that are needed)
festivities = pd.read_csv(
    "../input/festivities-in-finland-norway-sweden-tsp-0122/nordic_holidays.csv",
    usecols=['date', 'country', 'holiday'],
    parse_dates=['date'],
)

# attach the holiday-derived columns to the train frame, then to the test frame
df_train, df_test = [holiday_features(festivities, frame) for frame in (df_train, df_test)]

In [None]:
# GDP table in wide format: one row per year, one GDP_<country> column per country
gdp_wide = pd.read_csv("../input/gdp-20152019-finland-norway-and-sweden/GDP_data_2015_to_2019_Finland_Norway_Sweden.csv")

# Reshape to long format (one row per year and country). `melt` derives the
# country label from the column name, so the result does not depend on how
# many years the file contains and `year` keeps its original dtype.
gdp = gdp_wide.melt(
    id_vars='year',
    value_vars=['GDP_Finland', 'GDP_Norway', 'GDP_Sweden'],
    var_name='country',
    value_name='gdp',
)
gdp['country'] = gdp['country'].str.replace('GDP_', '', regex=False)

# add data
for _df in [df_train, df_test]:
    # The merged values are written back by position, so the merge must not
    # duplicate rows: `validate` raises if (country, year) is not unique in gdp.
    gdp_countries = _df.merge(
        gdp, on=['country', 'year'], how='left', validate='many_to_one'
    )['gdp'].values
    # one GDP column per country; rows of the other countries are set to 0
    for country in ['Finland', 'Norway', 'Sweden']:
        _df['gdp_' + country] = gdp_countries * (_df['country'] == country).astype(int)

## 2. Data visualizations

### 2.1 Yearly results

In [None]:
# total units sold per day, indexed by (year, product, date)
df_g = df_train.groupby(['year', 'product', 'date'])['num_sold'].sum()

# one panel per year on a 2x2 grid with a shared y-axis
fig, axes = plt.subplots(2, 2, figsize=(24, 8), sharey=True)

axes = axes.flatten()

for i, _year in enumerate(range(2015, 2019)):
    _df = df_g[_year].reset_index()
    # label each panel with its year, matching the other breakdown plots
    axes[i].set_title(str(_year))
    sns.lineplot(data=_df, x='date', y='num_sold', hue='product', linewidth=2.5, ax=axes[i])

plt.tight_layout()

### 2.2 sales by product group

In [None]:
# daily sales totals, indexed by (product, year, date)
df_g = df_train.groupby(['product', 'year', 'date'])['num_sold'].sum()

# three stacked panels sharing the y-axis, one per product
fig, axes = plt.subplots(3, 1, figsize=(24, 8), sharey=True)
axes = axes.flatten()

for i, _product in enumerate(df_train['product'].unique()):
    _df = df_g[_product].reset_index()
    ax = axes[i]
    ax.set_title(_product)
    sns.lineplot(
        data=_df,
        x='date', y='num_sold', hue='year',
        linewidth=2.5, ax=ax, palette="tab10",
    )

plt.tight_layout()

### 2.3 sales by store

In [None]:
# daily sales totals, indexed by (store, year, date)
df_g = df_train.groupby(['store', 'year', 'date'])['num_sold'].sum()

# two stacked panels sharing the y-axis, one per store
fig, axes = plt.subplots(2, 1, figsize=(24, 8), sharey=True)
axes = axes.flatten()

for i, _store in enumerate(df_train['store'].unique()):
    _df = df_g[_store].reset_index()
    ax = axes[i]
    ax.set_title(_store)
    sns.lineplot(
        data=_df,
        x='date', y='num_sold', hue='year',
        linewidth=2.5, ax=ax, palette="tab10",
    )

plt.tight_layout()

### 2.4 sales by country

In [None]:
# daily sales totals, indexed by (country, year, date)
df_g = df_train.groupby(['country', 'year', 'date'])['num_sold'].sum()

# three stacked panels sharing the y-axis, one per country
fig, axes = plt.subplots(3, 1, figsize=(24, 8), sharey=True)
axes = axes.flatten()

for i, _country in enumerate(df_train['country'].unique()):
    _df = df_g[_country].reset_index()
    ax = axes[i]
    ax.set_title(_country)
    sns.lineplot(
        data=_df,
        x='date', y='num_sold', hue='year',
        linewidth=2.5, ax=ax, palette="tab10",
    )

plt.tight_layout()

## 3. Feature engineering

### 3.1 `min`-`max` scale days.

In 2015 February had 28 days, while in 2016 it had 29. The `day` feature is therefore `min-max` scaled by the number of days in its specific month and year, i.e. the 28th of February 2015 gets the same value as the 31st of January.

In [None]:
# feature frames are copies, so the transformations below leave df_train / df_test untouched
X_train, X_test = df_train.copy(), df_test.copy()

# split the target column off the training features
y_train = X_train.pop('num_sold')

In [None]:
# Number of days in every (year, month) present in the data.
# Taken from the calendar (`dt.days_in_month`) rather than from the largest
# observed `day`, which would be too small for a month that is only partly
# covered by the data.
DAYS_PER_MONTH = (
    pd.concat([df_train, df_test])
    .assign(max_days=lambda frame: frame['date'].dt.days_in_month)
    [['year', 'month', 'max_days']]
    .drop_duplicates()
    .sort_values(['year', 'month'])
    .reset_index(drop=True)
)
DAYS_PER_MONTH.head()

In [None]:
def _scale_day_by_max_days(frame):
    """Return `frame` with `day` divided by the `max_days` of its (year, month)."""
    merged = frame.merge(DAYS_PER_MONTH, on=['year', 'month'], how='left')
    merged['day'] = merged['day'] / merged['max_days']
    # the helper column is only needed for the division
    return merged.drop(columns='max_days')


# apply the same scaling to the train and the test features
X_train = _scale_day_by_max_days(X_train)
X_test = _scale_day_by_max_days(X_test)

In [None]:
# min-max scale the remaining numeric columns; every scaler is fitted on the
# training values only and then re-used for the test values
for col in ['day_of_year', 'days_from_easter', 'days_from_wed_jun', 'days_from_sun_nov']:
    train_values = X_train[col].values.reshape(-1, 1)
    test_values = X_test[col].values.reshape(-1, 1)

    scaler = MinMaxScaler().fit(train_values)

    X_train[col] = scaler.transform(train_values)
    X_test[col] = scaler.transform(test_values)

### 3.2 one-hot-encoding

In [None]:
# one-hot encode categorical features
X_train = pd.get_dummies(X_train, columns=['country', 'store', 'product', 'month'])
X_test = pd.get_dummies(X_test, columns=['country', 'store', 'product', 'month'])

# The two frames are encoded independently, so a category present in only one
# of them would give a different set/order of columns. The models below select
# features by position, therefore force the test frame onto the training
# layout (dummy columns missing from test are filled with 0).
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [None]:
# rows from 2018 form the validation part, all remaining rows the training part
is_validation_row = X_train.year == 2018
_X_train = X_train.loc[~is_validation_row].copy()
_X_valid = X_train.loc[is_validation_row].copy()

## 4. Modeling

### Helper functions

In [None]:
# https://www.kaggle.com/c/web-traffic-time-series-forecasting/discussion/36414
# https://www.kaggle.com/teckmengwong/tps2201-hybrid-time-series
def smape_loss(y_true, y_pred):
    """
    Symmetric mean absolute percentage error (SMAPE), in percent.

    Computed element-wise as ``200 * |y_true - y_pred| / (|y_true| + |y_pred|)``
    and averaged over all elements. Elements where both values are 0 count
    as an error of 0.

    Parameters
    ----------
    y_true : array-like
        Ground truth (correct) target values.
    y_pred : array-like
        Estimated target values, in the same order as `y_true`.

    Returns
    -------
    loss : float
        Mean SMAPE over all elements; 0.0 is a perfect prediction and 200.0
        the largest possible value.

    Notes
    -----
    Inputs are converted to NumPy float arrays first, so lists are accepted
    and pandas objects are compared by position, not by index label.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    denominator = (np.abs(y_true) + np.abs(y_pred)) / 200.0
    abs_error = np.abs(y_true - y_pred)

    # divide only where the denominator is non-zero; the remaining elements
    # keep the 0.0 they were initialised with (no divide-by-zero warning)
    diff = np.divide(abs_error, denominator,
                     out=np.zeros_like(denominator), where=denominator != 0)
    return np.mean(diff)

### 4.1 `linear regression` baseline model

In [None]:
# fit a plain linear regression on the training part
# (the first three columns are left out of the feature matrix)
model_lin_reg = LinearRegression().fit(_X_train.iloc[:, 3:], y_train.loc[_X_train.index])

# predict the validation part
_pred_lin_reg = model_lin_reg.predict(_X_valid.iloc[:, 3:])

# score the validation predictions with SMAPE
smape_loss_lin_reg = smape_loss(y_train.loc[_X_valid.index], _pred_lin_reg)
print(f'Vadidation data SMAPE:', f'Lin. reg.: {smape_loss_lin_reg:.2f}', sep='\n')

In [None]:
# validation features plus the observed target and the linear-regression
# predictions; this frame is what the plotting helpers consume
valid_res = _X_valid.assign(**{
    'observed': y_train.loc[_X_valid.index],
    'Linear regression': _pred_lin_reg,
})
valid_res.head()

In [None]:
def plot_predictions(df_val, model_name='Linear regression'):
    """
    Plot a model's validation predictions against the observed values.

    Top panel: values summed per date, observed as points and the model as a
    line. Bottom panel: histograms of the row-level observed and predicted
    values.

    Parameters
    ----------
    df_val : pandas.DataFrame
        Must contain the columns 'date', 'observed' and `model_name`.
    model_name : str
        Column of `df_val` holding the predictions to plot; also used in the
        title of the top panel.
    """
    # totals per date for the temporal panel
    df_plot = df_val.groupby('date')[['observed', model_name]].sum()

    fig, axes = plt.subplots(2, 1, figsize=(24, 8))

    # temporal visualization
    axes[0].scatter(df_plot.index, df_plot['observed'], label='Observed', color='#348ABD')
    axes[0].plot(df_plot.index, df_plot[model_name], label='Model', linewidth=2.5)
    axes[0].set_xlabel('Date')
    axes[0].set_title(f'{model_name} model.')

    # histograms: read both series from `df_val` so the panel always shows the
    # model selected by `model_name` rather than a fixed set of predictions
    axes[1].hist([df_val['observed'].values, df_val[model_name].values],
                 bins=np.linspace(0, 3000, 201),
                 label=['Observed', 'Model'], color=['#348ABD', '#E24A33'])

    # add legends with white backgrounds
    for ax in axes:
        legend = ax.legend(frameon=1)
        legend.get_frame().set_facecolor('w')

    plt.tight_layout()

In [None]:
# plot the linear-regression predictions stored in valid_res
plot_predictions(valid_res, 'Linear regression')

### 4.2 Decision Tree Regressor

* baseline features.

In [None]:
# fit a decision tree regressor (seeded) on the training part
# (the first three columns are left out of the feature matrix)
model_des_tree = DecisionTreeRegressor(random_state=RANDOM_SEED).fit(
    _X_train.iloc[:, 3:], y_train.loc[_X_train.index]
)

# predict the validation part
_pred_des_tree = model_des_tree.predict(_X_valid.iloc[:, 3:])

# score the validation predictions with SMAPE
smape_loss_des_tree = smape_loss(y_train.loc[_X_valid.index], _pred_des_tree)
print(f'Vadidation data SMAPE:', f'Des. tree: {smape_loss_des_tree:.2f}', sep='\n')

In [None]:
# keep the tree's validation predictions next to the results stored so far
valid_res = valid_res.assign(**{'Decision Tree': _pred_des_tree})
valid_res.head()

In [None]:
# plot the decision-tree predictions stored in valid_res
plot_predictions(valid_res, 'Decision Tree')

In [None]:
def plot_predictions_2(df_val, model_name='Linear regression'):
    """
    Plot observed vs. predicted values for every country / store pair.

    The rows of the global `df_train` that share their index labels with
    `df_val` are combined with the predictions in `df_val[model_name]`,
    summed per (date, country, store) and drawn on a 3x2 grid of panels.
    Each panel title reports the SMAPE of the plotted series.
    """
    # original (non-encoded) rows matching the validation index, plus predictions
    combined = df_train.loc[df_val.index].copy()
    combined['model'] = df_val[model_name]

    # totals per date, country and store
    totals = (
        combined
        .groupby(['date', 'country', 'store'])[['num_sold', 'model']]
        .sum()
        .reset_index()
    )

    fig, axes = plt.subplots(3, 2, figsize=(24, 10))

    countries = totals.country.unique()
    stores = totals.store.unique()

    for row, country in enumerate(countries):
        for col, store in enumerate(stores):
            ax = axes[row, col]

            # rows of this country / store combination
            subset = totals.loc[(totals.store == store) & (totals.country == country)]
            ax.scatter(subset.date, subset.num_sold, label='Observed', color='#348ABD')
            ax.plot(subset.date, subset.model, label='Model', linewidth=2.5)

            # SMAPE of the plotted subsection
            _smape = smape_loss(subset.num_sold, subset.model)

            # title, axis label and a legend with a white background
            ax.set_title(f'{country.title()} - {store}. SMAPE: {_smape:.1f}')
            ax.set_ylabel('num. sold')
            legend = ax.legend(frameon=1)
            legend.get_frame().set_facecolor('w')

    plt.tight_layout()

In [None]:
# per country / store view of the decision-tree predictions stored in valid_res
plot_predictions_2(valid_res, model_name='Decision Tree')

## 5. Submission

In [None]:
# decision tree regressor (seeded) fitted on the complete training data
# (the first three columns are left out of the feature matrix)
final_model = DecisionTreeRegressor(random_state=RANDOM_SEED).fit(X_train.iloc[:, 3:], y_train)

# predictions for the test data
pred_test = final_model.predict(X_test.iloc[:, 3:])

In [None]:
# submission frame: row ids of the test data plus the predictions
submission = X_test[['row_id']].assign(num_sold=pred_test)
# round the predictions before writing them out
submission['num_sold'] = submission['num_sold'].round()
submission.to_csv('submission.csv', index=False)