# Load Data

In [None]:
import os
import math as m
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the competition's train / test splits from the Kaggle input directory.
train_path = '../input/tabular-playground-series-jan-2022/train.csv'
test_path = '../input/tabular-playground-series-jan-2022/test.csv'
train, test = map(pd.read_csv, (train_path, test_path))


# Feature Engineering

In [None]:
from sklearn.preprocessing import LabelEncoder
import dateutil.easter as easter
import datetime
import itertools
import operator

# A - Add year/month/day columns
# B - Add weekday column
# Both frames get exactly the same treatment, so handle them in one loop.
# The `.dt` accessor derives each calendar part in a single vectorised call
# instead of invoking a Python lambda once per row.
for frame in (train, test):
    frame['date'] = pd.to_datetime(frame['date'], format='%Y-%m-%d')
    frame['year'] = frame['date'].dt.year
    frame['month'] = frame['date'].dt.month
    frame['day'] = frame['date'].dt.day
    # pandas numbers weekdays from Monday=0 to Sunday=6, matching `weekday_label`.
    frame['weekday'] = frame['date'].dt.weekday
weekday_label = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# C - Add GDP per capita column
# The CSV is indexed by year; unstacking it gives a flat lookup keyed by
# (column name, year), which is then matched against each row's (country, year).
gdp_csv = '../input/gdp-per-capita-finland-norway-sweden-201519/GDP_per_capita_2015_to_2019_Finland_Norway_Sweden.csv'
GDP_PC = pd.read_csv(gdp_csv, index_col='year')
GDP_PC_dict = GDP_PC.unstack().to_dict()
for frame in (train, test):
    country_year = pd.MultiIndex.from_frame(frame[['country', 'year']])
    frame['GDP_PC'] = country_year.map(GDP_PC_dict)

# D - Add snow depth column
snow = pd.read_csv('../input/finland-norway-and-sweden-weather-data-20152019/nordics_weather.csv')
snow['date'] = pd.to_datetime(snow['date'], format='%m/%d/%Y')
# Split the timestamp into the same calendar parts used as join keys on train / test.
for part in ('year', 'month', 'day'):
    snow[part] = snow['date'].apply(lambda ts, part=part: getattr(ts, part))
snow = snow[['country', 'year', 'month', 'day', 'snow_depth']]

# Lookup table: (country, year, month, day) -> snow depth.
snow_keys = ['country', 'year', 'month', 'day']
snow_map = snow.set_index(snow_keys)['snow_depth'].to_dict()

for frame in (train, test):
    frame['snow_depth'] = frame.set_index(snow_keys).index.map(snow_map)

# E - Holiday (Samuel Cortinhas)
holiday = pd.read_csv('../input/public-and-unofficial-holidays-nor-fin-swe-201519/holidays.csv')
holiday['date'] = pd.to_datetime(holiday['date'], format='%Y-%m-%d')

## Add Divine Mercy Sunday (seven days after each Easter Sunday entry)
# Take an explicit copy: assigning into a filtered slice of `holiday` raises
# SettingWithCopyWarning and is not guaranteed to behave consistently.
holiday_dms = holiday[holiday['event'] == 'Easter Sunday'].copy()
holiday_dms['event'] = 'Divine Mercy Sunday'
holiday_dms['date'] = holiday_dms['date'] + datetime.timedelta(days=7)
holiday = pd.concat([holiday, holiday_dms], axis=0)

## add year, month, day
holiday['year'] = holiday['date'].dt.year
holiday['month'] = holiday['date'].dt.month
holiday['day'] = holiday['date'].dt.day

# Per-country holiday calendars. They do not depend on the frame being
# processed, so build them once; the "Holiday in April" cell reuses them.
fin_holiday = holiday.loc[holiday.country == 'Finland']
swe_holiday = holiday.loc[holiday.country == 'Sweden']
nor_holiday = holiday.loc[holiday.country == 'Norway']
holidays_by_country = {'Finland': fin_holiday,
                       'Sweden': swe_holiday,
                       'Norway': nor_holiday}

for df in [train, test]:
    # 1 when the row's date is a holiday in the row's own country, else 0.
    df['holiday'] = 0
    for country, country_holiday in holidays_by_country.items():
        in_country = df.country == country
        df.loc[in_country, 'holiday'] = (
            df.loc[in_country, 'date'].isin(country_holiday.date).astype(int)
        )
    
# F - Fourier (Samuel Cortinhas)
for df in [train, test]:
    # temporary one hot encoding
    for product in ['Kaggle Mug', 'Kaggle Hat']:
        df[product] = df['product'] == product

    # The three products have different seasonal patterns, so the yearly
    # sine / cosine wave is only kept as product-specific interaction terms.
    dayofyear = df.date.dt.dayofyear
    for k in range(1, 2):
        angle = dayofyear / 365 * 2 * m.pi * k
        sin_k = np.sin(angle)
        cos_k = np.cos(angle)
        # The plain sin/cos terms are never stored on the frame. The previous
        # `df = df.drop(...)` only rebound the loop variable, so `sin{k}` /
        # `cos{k}` silently stayed in train / test as model features.
        df[f'mug_sin{k}'] = sin_k * df['Kaggle Mug']
        df[f'mug_cos{k}'] = cos_k * df['Kaggle Mug']
        df[f'hat_sin{k}'] = sin_k * df['Kaggle Hat']
        df[f'hat_cos{k}'] = cos_k * df['Kaggle Hat']

    # drop temporary one hot encoding (in place, so train / test themselves change)
    df.drop(columns=['Kaggle Mug', 'Kaggle Hat'], inplace=True)

# Visualization
## Total sales by date

In [None]:
# Daily total sales (all countries, stores and products), one panel per training year.
series = train.groupby(['year', 'date']).num_sold.sum()

fig = plt.figure(figsize=(30, 13))
for i, year in enumerate([2015, 2016, 2017, 2018]):
    ax = fig.add_subplot(2, 2, i+1)
    ax.plot(series[year].index, series[year].values)
    ax.set_title(f'Sales trend in {year}', fontsize=8)
    ax.set_xlabel('Date', fontsize=8)
    ax.set_ylabel('Sales', fontsize=8)
    ax.tick_params(axis='both', labelsize=8)
    # No legend: each panel holds a single unlabelled line, so a legend call
    # would only emit a "no artists with labels" warning. The figure is
    # rendered once by plt.show() below rather than on every iteration.
plt.show()


- Cyclical movements are influenced by the day of the week.
- Sales peak at the end of December.
- There is also a peak in April.

**need to add weekday column**

## Total sales by products.

In [None]:
# Monthly sales per product, one panel per training year.
series = train.groupby(['year', 'product', 'month']).num_sold.sum()

fig = plt.figure(figsize=(30, 13))
for panel, year in enumerate([2015, 2016, 2017, 2018], start=1):
    ax = fig.add_subplot(2, 2, panel)
    # One line per product, labelled with the product name.
    for product_name in ('Kaggle Hat', 'Kaggle Mug', 'Kaggle Sticker'):
        monthly = series[year][product_name]
        ax.plot(monthly.index, monthly.values, label=product_name)
    ax.set_title(f'Sales trend in {year}', fontsize=8)
    ax.set_xlabel('Month', fontsize=8)
    ax.set_ylabel('Sales', fontsize=8)
    plt.xticks(fontsize=8)
    plt.yticks(fontsize=8)
    plt.legend(fontsize=8)
plt.show()


- Only the Hat has a peak in April.

**Might need to add some kind of feature, but I'm not sure...**

-> Is it affected by Divine Mercy Sunday?

## GDP per capita

In [None]:
# GDP per capita over time, one line per country column of GDP_PC.
for country_name in ('Finland', 'Norway', 'Sweden'):
    plt.plot(GDP_PC.index, GDP_PC[country_name].values, label=country_name)
# Force integer year ticks.
plt.xticks([2015, 2016, 2017, 2018, 2019])
plt.xlabel('year')
plt.ylabel('GDP per capita')
plt.legend()
plt.show()

- GDP must be related to sales, and data in 2019 is important to predict.
- I think the impact of Covid-19 will also show up in GDP, but this cannot be confirmed from annual data that only runs through 2019.

## Snow depth

In [None]:
# Monthly sum of the snow depth readings per country, one panel per year.
series = snow.groupby(['year', 'country', 'month']).snow_depth.sum()

fig = plt.figure(figsize=(30, 13))
for i, year in enumerate([2015, 2016, 2017, 2018]):
    ax = fig.add_subplot(2, 2, i+1)
    for country in ['Norway', 'Sweden', 'Finland']:
        monthly = series[year][country]
        ax.plot(monthly.index, monthly.values, label=country)
    # Title each panel so the four years can be told apart.
    ax.set_title(f'Snow depth in {year}', fontsize=8)
    # The series is indexed by month (see the groupby above), not by day.
    ax.set_xlabel('Month', fontsize=8)
    ax.set_ylabel('Snow depth', fontsize=8)
    ax.tick_params(axis='both', labelsize=8)
    ax.legend(fontsize=8)
plt.show()


- This feature might be related to Hat sales, but, really?

## Holiday in April

In [None]:
# For every training year, list the April holidays of each country and plot
# the daily April sales per country next to them.
month = 4
holiday_cols = ['date', 'country', 'event', 'type']
# The grouping does not depend on the year, so compute it once up front.
series = train.groupby(['year', 'country', 'month', 'date']).num_sold.sum()

for year in [2015, 2016, 2017, 2018]:
    for n, country_holiday in enumerate([fin_holiday, swe_holiday, nor_holiday]):
        if n > 0:
            print('----')
        in_month = (country_holiday['year'] == year) & (country_holiday['month'] == month)
        print(country_holiday[in_month][holiday_cols])

    fig = plt.figure(figsize=(20, 5))
    # State the year on the figure; the panel titles only name the country.
    fig.suptitle(f'Daily sales in April {year}', fontsize=12)
    for i, country in enumerate(['Finland', 'Sweden', 'Norway']):
        ax = fig.add_subplot(1, 3, i+1)
        daily = series[year][country][month]
        ax.plot(daily.index, daily.values)
        ax.set_title(f'Sales trend in {country}', fontsize=12)
        ax.set_xlabel('Date', fontsize=8)
        ax.set_ylabel('Sales', fontsize=8)
        ax.tick_params(axis='x', labelsize=6)
        ax.tick_params(axis='y', labelsize=8)
        # No legend: each panel holds a single unlabelled line.
    plt.show()

- **Divine Mercy Sunday** has as much effect on sales as Easter Sunday.

**We need to add Divine Mercy Sunday column**

# Pre-processing

In [None]:
train_test_concat = pd.concat([train, test])

for c in ['country', 'store', 'product']:
    # Fit the encoder on train + test together so both frames share one
    # label -> integer mapping. LabelEncoder numbers the classes in sorted order.
    le = LabelEncoder()

    # Transform
    train_test_concat[c] = le.fit_transform(train_test_concat[c].fillna('NA'))

# Split back into train / test by row_id. Take copies so that later column
# assignments (e.g. attaching predictions to `test`) write to independent
# frames instead of slices of `train_test_concat`.
is_train = train_test_concat['row_id'].isin(train['row_id'])
is_test = train_test_concat['row_id'].isin(test['row_id'])
train = train_test_concat[is_train].copy()
test = train_test_concat[is_test].copy()

# Delete unused columns.
# Separate training data and target variables.
response_label = 'num_sold'
train_y = train[response_label]
# `drop_label` already contains the response column, so it is dropped exactly
# once from both frames (in `test` it only holds the NaNs introduced by the concat).
drop_label = ['row_id', 'date', response_label]
train_x = train.drop(columns=drop_label)
test_x = test.drop(columns=drop_label)

# Modeling

In [None]:
import lightgbm as lgb
# import optuna.integration.lightgbm as lgb
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold
import seaborn as sns
from rgf.sklearn import RGFRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.metrics import mean_squared_log_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, BatchNormalization, Dropout
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor

# Define sMAPE func.
def smape(true, preds):
    """Symmetric mean absolute percentage error, in percent (0 to 200).

    Computes ``mean(2 * |preds - true| / (|true| + |preds|)) * 100``.

    Parameters
    ----------
    true : array-like
        Observed values.
    preds : array-like
        Predicted values, same length as ``true``.

    Returns
    -------
    float
        The sMAPE score. Elements where both the observed and the predicted
        value are zero contribute 0 (a perfect prediction) instead of the
        NaN that a plain 0 / 0 division would produce.
    """
    true = np.asarray(true, dtype=float)
    preds = np.asarray(preds, dtype=float)

    numerator = 2.0 * np.abs(preds - true)
    denominator = np.abs(true) + np.abs(preds)
    # Divide only where the denominator is non-zero; the rest keep the 0 from `out`.
    ratio = np.divide(numerator, denominator,
                      out=np.zeros_like(denominator),
                      where=denominator != 0)
    return float(np.mean(ratio) * 100)


# Hyper-parameters for each base learner of the stacked ensemble.

# LightGBM: gradient-boosted trees, regression objective, MAPE as metric.
lgbm_params = dict(
    task='train',
    boosting_type='gbdt',
    objective='regression',
    force_col_wise=True,
    metric='mape',
)

# Keras neural network: training schedule only (architecture lives in build_fn).
keras_params = dict(epochs=400, batch_size=512)

# Extra-trees regressor.
et_params = dict(
    n_estimators=100,
    max_features=0.5,
    max_depth=18,
    min_samples_leaf=4,
    n_jobs=-1,
)

# Random forest regressor.
rf_params = dict(
    n_estimators=125,
    max_features=0.2,
    max_depth=25,
    min_samples_leaf=4,
    n_jobs=-1,
)

# Regularized greedy forest.
# NOTE(review): 'Log' looks like RGF's logistic loss; confirm it is intended
# for a regression target.
rgf_params = dict(algorithm='RGF_Sib', loss='Log')


def build_fn():
    """Build and compile the feed-forward network used as the 'nn' base learner.

    The input width is read from the module-level ``train_x`` at call time.
    Layers: Dense(64) -> BatchNorm -> SELU -> Dense(128) -> BatchNorm ->
    Dropout(0.2) -> SELU -> Dense(64, SELU) -> Dense(1).

    Returns
    -------
    A compiled Keras ``Sequential`` model trained with mean absolute
    percentage error and the Adam optimizer.
    """
    n_features = train_x.shape[1]
    layers = [
        Dense(64, input_dim=n_features),
        BatchNormalization(),
        Activation('selu'),
        Dense(128),
        BatchNormalization(),
        Dropout(0.2),
        Activation('selu'),
        Dense(64, activation='selu'),
        Dense(1),
    ]
    network = Sequential(layers)
    network.compile(loss='mean_absolute_percentage_error',
                    optimizer='adam',
                    metrics=['MeanAbsolutePercentageError'])
    return network


# Wrap the Keras model builder in the scikit-learn wrapper so it can be used
# as one of the base learners of the stack; training settings come from keras_params.
keras_reg = KerasRegressor(build_fn=build_fn, verbose=0, **keras_params)
# Mark the wrapper as a regressor by hand.
# NOTE(review): presumably required for StackingRegressor to accept the wrapper — confirm.
keras_reg._estimator_type = 'regressor'

# Base learners of the stacked ensemble, as (name, estimator) pairs.
# NOTE(review): no estimator is given a random seed, so results may differ
# between runs — confirm whether reproducibility is required.
estimators = [
    ('lgb', lgb.LGBMRegressor(**lgbm_params)),
    ('nn', keras_reg),
    ('rgf', RGFRegressor(**rgf_params)),
    ('et', ExtraTreesRegressor(**et_params)),
    ('rf', RandomForestRegressor(**rf_params)),
    # ('lr', LinearRegression()),
    ('knn', KNeighborsRegressor())
]

# Stack the base learners under a linear-regression meta-model, fit on the
# full training set, and predict the test set. `pred` is reused by later cells.
model_stack = StackingRegressor(estimators=estimators,
                                final_estimator=LinearRegression())
model_stack.fit(train_x, train_y)
pred = model_stack.predict(test_x)

# Cross validation.
# Split train data into 4 folds.
#score_folds = []
#kf = KFold(n_splits=4, shuffle=True, random_state=71)
#for tr_idx, va_idx in kf.split(train_x):
#    # Separate training data and validation data.
#    tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
#    tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
#
#    # Convert feature values and objective variables to lightgbm data structure.
#    lgb_train = lgb.Dataset(tr_x, tr_y)
#    lgb_eval = lgb.Dataset(va_x, va_y)
#
#    #  Hyper param setting.
#    params = {
#        'task': 'train',
#        'boosting_type': 'gbdt',
#        'objective': 'regression',
#        'force_col_wise': True,
#        'metric': 'mape'
#    }
#    
#    num_round = 8000
#
#    # Train
#    model = lgb.train(params,
#                      train_set=lgb_train,
#                      num_boost_round=num_round,
#                      early_stopping_rounds=1000,
#                      verbose_eval=100,
#                      valid_sets=lgb_eval)
#
#    # Score confirmation with validation data.
#    va_pred = model.predict(va_x)
#    mape = smape(va_y, va_pred)
#    score_folds.append(mape)
#    best_params_folds.append(model.params)
#
#print(score_folds)
#pred = model.predict(test_x)
# Attach the predictions to the test rows so later cells can aggregate them.
test['num_sold'] = pred

# Overlay the raw (per-row) training targets and the test predictions over time.
fig, ax = plt.subplots()
ax.plot(train['date'], train_y)
ax.plot(test['date'], pred)
plt.show()

- I think the predictions capture most of the patterns seen in the previous years.
- But, I feel like it's missing the peak of April.
- Covid-19 spread from 2019, but we may need to dig a little deeper into the impact of this.
- For example, the number of infected people and monthly GDP.

# Output

In [None]:
# Make submission file
# Round to the nearest integer before casting: a bare astype('int') truncates
# toward zero and therefore biases every prediction downward.
submission = pd.DataFrame({'row_id': test['row_id'],
                           'num_sold': np.round(pred).astype('int')})
submission.to_csv('submission.csv', index=False)

# Re-Visualization
## Comparison of each year sales and predicted sales

In [None]:
# Daily total sales of each training year, overlaid with the predicted daily totals.
series = train.groupby(['year', 'date']).num_sold.sum()
series_test = test.groupby(['date']).num_sold.sum()
fig = plt.figure(figsize=(30, 13))
# The predicted series is drawn against the training year's dates, so both
# must have the same number of days.
# NOTE(review): 2016 is presumably left out because it is a leap year and its
# 366 days would not line up with the predicted series — confirm.
for i, year in enumerate([2015, 2017, 2018]):
    ax = fig.add_subplot(2, 2, i+1)
    ax.plot(series[year].index, series[year].values, label='train')
    ax.plot(series[year].index, series_test.values, label='test')
    ax.set_title(f'Sales trend in {year}', fontsize=12)
    ax.set_xlabel('Date', fontsize=8)
    ax.set_ylabel('Sales', fontsize=8)
    ax.tick_params(axis='x', labelsize=4)
    ax.tick_params(axis='y', labelsize=8)
    ax.legend(fontsize=8)
# Render once, after all panels are drawn (no per-panel fig.show()).
plt.show()

- The predictions capture most of the patterns seen in the previous years.
- But, end-of-year sales has some amount of fluctuation each year.

**Can we add a feature that is related to end-of-year sales?**

In [None]:
# Per (year, country, product): daily training sales overlaid with the predictions.
series = train.groupby(['year', 'country', 'product', 'date']).num_sold.sum()
series_test = test.groupby(['country', 'product', 'date']).num_sold.sum()
fig = plt.figure(figsize=(30, 45))
# `country` and `product` were integer-encoded with LabelEncoder in the
# pre-processing cell. LabelEncoder numbers classes in sorted order, so the
# codes map back to the names alphabetically.
country_label = {0: 'Finland', 1: 'Norway', 2: 'Sweden'}
product_label = {0: 'Kaggle Hat', 1: 'Kaggle Mug', 2: 'Kaggle Sticker'}
for k, year in enumerate([2015, 2017, 2018]):
    for j, country in enumerate([0, 1, 2]):
        for i, product in enumerate([0, 1, 2]):
            # 9 rows x 3 columns: one block of three rows per year, one row
            # per country inside it, one column per product.
            ax = fig.add_subplot(9, 3, i+1+j*3+k*9)
            train_daily = series[year][country][product]
            ax.plot(train_daily.index, train_daily.values, label='train')
            ax.plot(train_daily.index, series_test[country][product].values, label='test')
            ax.set_title(f'Sales trend in {product_label[product]}, {country_label[country]}', fontsize=12)
            ax.set_xlabel('Date', fontsize=8)
            ax.set_ylabel('Sales', fontsize=8)
            ax.tick_params(axis='x', labelsize=10)
            ax.tick_params(axis='y', labelsize=8)
            ax.legend(fontsize=8)
plt.show()

- The weekly trend looks good; the features capture it well.
- The annual offset might be related to GDP.

**I don't have any further ideas for external datasets to add.**