![](https://storage.googleapis.com/kaggle-competitions/kaggle/28009/logos/header.png?)

# Overview

For this challenge, you will be predicting a full year worth of sales for three items at two stores located in three different countries. This dataset is completely fictional, but contains many effects you see in real-world data, e.g., weekend and holiday effect, seasonality, etc.

## Files

* `train.csv` - the training set, which includes the sales data for each date-country-store-item combination.

* `test.csv` - the test set; your task is to predict the corresponding item sales for each date-country-store-item combination. Note the Public leaderboard is scored on the first quarter of the test year, and the Private on the remaining.

* `sample_submission.csv` - a sample submission file in the correct format


# Setup

In [None]:
import warnings

# Register three "ignore" filters with the warnings machinery.
# NOTE(review): the second positional argument of `filterwarnings` is
# `message` (a regex matched against the start of the warning text), not
# `category`. These calls therefore only silence warnings whose text begins
# with these words; filtering by warning class would need `category=<class>`.
# TODO confirm the intent and switch to `category=` if classes were meant.
warnings.filterwarnings('ignore', 'SettingWithCopyWarning')
warnings.filterwarnings('ignore', 'UndefinedMetricWarning')
warnings.filterwarnings('ignore', 'ConvergenceWarning')

In [None]:
!pip install holidays >/dev/null 2>&1

In [None]:
import os
import random
import time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import holidays

from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.statespace.sarimax import SARIMAX
from dateutil.parser import parse

from IPython.display import display, Markdown, Latex

In [None]:
# seaborn
# Apply the seaborn theme first: `sns.set` rewrites the font-related rcParams,
# so calling it after the matplotlib block would discard the sizes set there.
sns.set(font_scale = 1.2)
sns.set_style("whitegrid")

# matplotlib
# Explicit overrides on top of the seaborn theme.
plt.rc('font', size=15)
plt.rc('axes', titlesize=18)
plt.rc('xtick', labelsize=10)
plt.rc('ytick', labelsize=10)
plt.rcParams.update({'figure.figsize': (10, 7), 'figure.dpi': 80})

In [None]:
class Cfg:
    """Notebook-wide constants: seed, file locations, column names and labels."""

    # Seed handed to every random number generator in `set_seed`.
    RANDOM_STATE = 2022

    # Input files and the name of the submission file to write.
    TRAIN_DATA = '../input/tabular-playground-series-jan-2022/train.csv'
    TEST_DATA = '../input/tabular-playground-series-jan-2022/test.csv'
    SUBMISSION = '../input/tabular-playground-series-jan-2022/sample_submission.csv'
    SUBMISSION_FILE = 'submission.csv'

    # Column roles in the csv files.
    INDEX = 'row_id'
    FEATURES = ['date', 'country', 'store', 'product']
    TARGET = 'num_sold'

    # Category orderings used for the ordered weekday / month categoricals.
    WEEKDAY_NAMES = [
        'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday',
        'Saturday', 'Sunday',
    ]

    MONTH_NAMES = [
        'Jan', 'Feb', 'Mar',
        'Apr', 'May', 'Jun',
        'Jul', 'Aug', 'Sep',
        'Oct', 'Nov', 'Dec',
    ]

    @staticmethod
    def set_seed():
        """Seed Python's `random` and NumPy's global generator with RANDOM_STATE."""
        seed = Cfg.RANDOM_STATE
        for seeder in (random.seed, np.random.seed):
            seeder(seed)


# Seed the generators once, as soon as the configuration exists.
Cfg.set_seed()

In [None]:
def write(text:str) -> None:
    """Show ``text`` in the cell output, wrapped in an IPython Markdown object."""
    rendered = Markdown(text)
    display(rendered)

# Read data

In [None]:
def read_data(
    train_file:str=Cfg.TRAIN_DATA,
    test_file:str=Cfg.TEST_DATA,
    submission_file:str=Cfg.SUBMISSION
) -> 'tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]':
    """Read the train, test and sample-submission csv files.

    Args:
        train_file: path of the training csv.
        test_file: path of the test csv.
        submission_file: path of the sample-submission csv.

    Returns:
        ``(train_df, test_df, submission_df)``; each frame is indexed by the
        ``Cfg.INDEX`` column.
    """
    def load(path:str) -> pd.DataFrame:
        # All three files share the same index column.
        return pd.read_csv(path).set_index(Cfg.INDEX)

    train_df = load(train_file)
    test_df = load(test_file)
    # Read the submission template from its own file (the test file was
    # previously read a second time here).
    submission_df = load(submission_file)

    return train_df, test_df, submission_df

In [None]:
# Load the three frames returned by read_data() using its default file paths.
train_data, test_data, submission_df = read_data()

In [None]:
# Display the training frame as the cell output.
train_data

In [None]:
# Display the test frame as the cell output.
test_data

In [None]:
# Display the first rows of the submission frame.
submission_df.head()

In [None]:
# Report the row counts of both frames and the column count of the training
# frame as Markdown bullet points.
write('* Train data: {} rows'.format(len(train_data)))
write('* Test data: {} rows'.format(len(test_data)))
write('* Train data: {} columns'.format(len(train_data.columns)))

### Notice

* The training data contains 26,298 rows.

* The test data contains 6,570 rows.

* There is one datetime attribute `date` which has values from `2015-01-01` to `2018-12-31`.

* The target variable `num_sold` is numerical and ranges from 70 to 2,884.

* There are 3 categorical features:
    * `country` - 'Finland', 'Norway', 'Sweden' 
    * `store` - 'KaggleMart', 'KaggleRama'
    * `product` - 'Kaggle Mug', 'Kaggle Hat', 'Kaggle Sticker'



# Missing values

In [None]:
# One row per data set holding the total number of NaN cells in that frame.
pd.DataFrame(
    {
        'missing_values': [
            frame.isna().sum().sum() for frame in (train_data, test_data)
        ],
    },
    index=pd.Index(['train', 'test'], name='data_set'),
)

### Notice

* There are no missing values in either data set.

# Feature engineering

## Convert data types

In [None]:
def convert_dtypes(df:pd.DataFrame) -> pd.DataFrame:
    """Cast the label columns to unordered categoricals and parse ``date``.

    The frame is modified in place and the same object is returned.
    """
    for column in ('country', 'store', 'product'):
        df[column] = pd.Categorical(df[column], ordered=False)

    df['date'] = pd.to_datetime(df['date'])

    return df

In [None]:
# Cast the label columns to categoricals and parse `date` in both frames.
train_data = convert_dtypes(train_data)
test_data = convert_dtypes(test_data)

In [None]:
def add_datatime(df:pd.DataFrame) -> pd.DataFrame:
    """Derive calendar columns from ``df['date']`` and return the same frame.

    Columns written in place, in this order: ``day`` and ``year``
    (categoricals), ``weekday`` and ``month`` (ordered categoricals using
    ``Cfg.WEEKDAY_NAMES`` / ``Cfg.MONTH_NAMES``), ``is_weekend`` and
    ``year_month`` (``'<year>/<month>'`` strings without zero padding).
    """
    dates = df['date'].dt
    weekend_days = ('Saturday', 'Sunday')

    df['day'] = pd.Categorical(dates.day)
    df['year'] = pd.Categorical(dates.year)

    weekday_names = dates.day_name()
    df['weekday'] = pd.Categorical(
        weekday_names, ordered=True, categories=Cfg.WEEKDAY_NAMES)

    # Three-letter month abbreviations, matching the entries of MONTH_NAMES.
    month_abbreviations = dates.month_name().str[:3]
    df['month'] = pd.Categorical(
        month_abbreviations, ordered=True, categories=Cfg.MONTH_NAMES)

    df['is_weekend'] = df['weekday'].map(lambda name: name in weekend_days)

    df['year_month'] = [
        str(year) + '/' + str(month)
        for year, month in zip(dates.year.values, dates.month.values)
    ]

    return df

In [None]:
def add_holidays(df:pd.DataFrame, country:str) -> pd.DataFrame:
    """Add a ``<country>_holiday`` column looked up from the `holidays` package.

    Args:
        df: frame with a datetime ``date`` column; it is modified in place.
        country: one of ``'finland'``, ``'norway'`` or ``'sweden'``.

    Returns:
        The same frame. The new column holds, per row, whatever the country's
        holiday provider returns from ``.get`` for that date (``None`` when
        the provider has no entry).

    Raises:
        ValueError: if ``country`` is not one of the supported names.
    """
    provider_classes = {
        'finland': 'Finland',
        'norway': 'Norway',
        'sweden': 'Sweden',
    }
    try:
        class_name = provider_classes[country]
    except KeyError:
        # Previously an unknown country failed later with an unbound `provider`.
        raise ValueError(
            f'Unsupported country {country!r}; '
            f'expected one of {sorted(provider_classes)}'
        ) from None

    provider = getattr(holidays, class_name)()

    # Look up every date (as an ISO 'YYYY-MM-DD' string) in the provider.
    lookup = np.frompyfunc(provider.get, nin=1, nout=1)
    date_keys = np.array(df['date'].map(lambda d: d.strftime('%Y-%m-%d')))

    df[f'{country}_holiday'] = lookup(date_keys)
    return df

In [None]:
# Add the calendar-derived columns to both frames.
train_data = add_datatime(train_data)
test_data = add_datatime(test_data)

In [None]:
# Add one `<country>_holiday` column per country to both frames.
# add_holidays writes into the frame it is given, so the return value is unused.
for country in ['finland', 'norway', 'sweden']:
    add_holidays(train_data, country)
    add_holidays(test_data, country)

In [None]:
# Display the first rows of the training frame, including the new columns.
train_data.head()

# Exploratory data analysis (EDA)

## Distribution of sales

In [None]:
def plot_dist(
    data,
    *,
    hue=None,
    title='Distribution of sales',
    xlabel='Month',
    ax=None
) -> None:
    """Draw a kernel density estimate of ``Cfg.TARGET`` for ``data``.

    Args:
        data: frame containing the ``Cfg.TARGET`` column (and ``hue`` if given).
        hue: optional column name used to split the density into groups.
        title: axes title.
        xlabel: accepted for signature compatibility but not used; the x-axis
            is always labelled '# Sold products'.
        ax: axes to draw on; defaults to the current axes.
    """
    if ax is None:
        ax = plt.gca()

    # Plot the frame passed by the caller (the global `train_data` was used
    # here before, so the `data` argument had no effect).
    sns.kdeplot(
        data=data,
        x=Cfg.TARGET,
        hue=hue,
        ax=ax)

    ax.set_title(title)
    ax.set_xlabel('# Sold products')
    ax.set_ylabel('Proportion')

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# `None` draws the overall distribution without a hue split.
subjects = ['country', 'product', 'store', None]

for subject, ax in zip(subjects, axes.flatten()):
    plot_dist(
        train_data,
        hue=subject,
        ax=ax,
        # Avoid the literal "per None" in the title of the overall panel.
        title=(
            'Distribution of solds' if subject is None
            else f'Distribution of solds per {subject}'
        ),
    )

plt.tight_layout()
plt.show()

In [None]:
def plot_boxplot(
    data,
    *,
    hue=None,
    x='month',
    title='Sales per month',
    xlabel='Month',
    ax=None
) -> None:
    """Draw a seaborn box plot of ``Cfg.TARGET`` grouped by ``x``.

    ``hue`` optionally splits each group further. The plot is drawn on ``ax``
    (the current axes when ``ax`` is None) and labelled with ``title`` and
    ``xlabel``; the y-axis is always labelled '# Sold products'.
    """
    target_ax = plt.gca() if ax is None else ax

    box_kwargs = dict(data=data, x=x, y=Cfg.TARGET, hue=hue, ax=target_ax)
    sns.boxplot(**box_kwargs)

    target_ax.set_title(title)
    target_ax.set_xlabel(xlabel)
    target_ax.set_ylabel('# Sold products')

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(18, 6))

# One box per `year_month` label of the training frame.
plot_boxplot(train_data, x='year_month', title='Month-wise bloxplot', ax=ax)

# Re-apply the existing tick labels rotated by 45 degrees and right-aligned.
ax.set_xticklabels(
    ax.get_xticklabels(), 
    rotation=45, 
    horizontalalignment='right'
)

plt.tight_layout()
plt.show()

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# `None` draws a single box over all rows.
subjects = ['country', 'product', 'store', None]

for subject, ax in zip(subjects, axes.flatten()):
    if subject is None:
        # Overall panel: no grouping column, so no x-label and no "per None".
        plot_boxplot(train_data, x=None, ax=ax, xlabel='', title='Boxplot - Sold')
    else:
        # Label the x-axis with the grouping column; plot_boxplot's default
        # x-label is 'Month', which is wrong for these panels.
        plot_boxplot(
            train_data, x=subject, ax=ax,
            xlabel=subject.capitalize(),
            title=f'Boxplot - Sold per {subject}')

plt.tight_layout()
plt.show()

In [None]:
# Summary statistics from describe(), transposed so each described column is a row.
train_data.describe().T

## Sales per month and weekday

In [None]:
def plot_line(
    data,
    *,
    hue=None,
    x='month',
    title='Sales per month',
    xlabel='Month',
    ax=None
) -> None:
    """Draw a seaborn line plot of ``Cfg.TARGET`` against ``x``.

    ``hue`` optionally draws one line per group. The plot goes on ``ax`` (the
    current axes when ``ax`` is None) and is labelled with ``title`` and
    ``xlabel``; the y-axis is always labelled '# Sold products'.
    """
    axis = ax if ax is not None else plt.gca()

    sns.lineplot(x=x, y=Cfg.TARGET, hue=hue, data=data, ax=axis)

    for apply_label, text in (
        (axis.set_title, title),
        (axis.set_xlabel, xlabel),
        (axis.set_ylabel, '# Sold products'),
    ):
        apply_label(text)

In [None]:
# For each grouping column: a line plot of sales per month on top and the
# matching box plot below, both split by that column.
subjects = ['country', 'product', 'store']
for subject in subjects:
    write(f'### Sales per month ({subject})')

    fig, ax = plt.subplots(2, 1, figsize=(15, 9))

    plot_line(train_data, x='month', hue=subject, ax=ax[0])
    plot_boxplot(train_data, hue=subject, ax=ax[1], title=f'Boxplot - Sales per month ({subject})')

    plt.tight_layout()
    plt.show()

## Sales per weekday

In [None]:
# For each grouping column: a line plot of sales per weekday on top and the
# matching box plot below, both split by that column.
subjects = ['country', 'product', 'store']
for subject in subjects:
    write(f'### Sales per weekday ({subject})')

    fig, ax = plt.subplots(2, 1, figsize=(15, 9))
    
    plot_line(train_data, hue=subject, x='weekday', 
        ax=ax[0], xlabel='Weekday', title=f'Sales per weekday ({subject})')
    
    plot_boxplot(train_data, hue=subject, x='weekday',
        ax=ax[1], xlabel='Weekday', title=f'Boxplot - Sales per weekday ({subject})')

    plt.tight_layout()
    plt.show()

## Time series

In [None]:
def plot_timeline(
    data,
    *,
    hue=None,
    x='date',
    title='Time series',
    xlabel='Time',
    freq='d',
    ax=None
) -> None:
    """Plot ``Cfg.TARGET`` summed per time bucket as a line chart.

    Args:
        data: frame with a datetime column ``x`` and the ``Cfg.TARGET`` column
            (plus the ``hue`` column if given).
        hue: optional column name; one line is drawn per group.
        x: name of the datetime column to bucket and to plot on the x-axis.
        title: axes title.
        xlabel: x-axis label.
        freq: pandas frequency alias defining the bucket size.
        ax: axes to draw on; defaults to the current axes.
    """
    if ax is None:
        ax = plt.gca()

    # Only group by `hue` when one was requested; a literal None is not a
    # valid grouping key.
    keys = [pd.Grouper(key=x, freq=freq)]
    if hue is not None:
        keys.append(hue)

    # Sum the target column only: the frame also carries categorical and
    # string columns that cannot (or should not) be summed.
    # Turn the group keys back into columns so they can be referenced by name.
    totals = data.groupby(keys)[[Cfg.TARGET]].sum().reset_index()

    sns.lineplot(
        data=totals,
        x=x,
        y=Cfg.TARGET,
        hue=hue,
        ax=ax)

    ax.set_title(title)

    ax.set_xlabel(xlabel)
    ax.set_ylabel('# Sold products')

In [None]:
# For each grouping column draw five stacked panels of summed sales, from the
# coarsest bucket size (annual) down to the finest (daily).
subjects = ['country', 'product', 'store']
for subject in subjects:
    write(f'### Time series ({subject})')
    
    fig, ax = plt.subplots(5, 1, figsize=(15, 20))
    
    plot_timeline(train_data, hue=subject, freq='Y', 
        title='Time series (annual)', ax=ax[0])
    
    plot_timeline(train_data, hue=subject, freq='Q', 
        title='Time series (quarterly)', ax=ax[1])
    
    plot_timeline(train_data, hue=subject, freq='M', 
        title='Time series (monthly)', ax=ax[2])
    
    plot_timeline(train_data, hue=subject, freq='w', 
        title='Time series (weekly)', ax=ax[3])
    
    plot_timeline(train_data, hue=subject, freq='d', 
        title='Time series (daily)', ax=ax[4])

    plt.tight_layout()
    plt.show()

# Decompose the time series

In [None]:
def split_data(data, dt):
    """Split ``data`` on its index at the cutoff ``dt``.

    Args:
        data: frame or series whose index is comparable with ``dt``.
        dt: cutoff value.

    Returns:
        ``(train, test)``: ``train`` holds the rows with index strictly before
        ``dt``; ``test`` holds the rows at or after ``dt``.
    """
    before_cutoff = data.index < dt

    train = data[before_cutoff]
    # Use the complement so the row(s) at exactly `dt` land in the test part
    # instead of being dropped from both halves.
    test = data[data.index >= dt]

    return train, test

In [None]:
def filter_data(data:pd.DataFrame, country:str, store:str, product:str) -> pd.DataFrame:
    """Return the sales series of one country / store / product combination.

    Args:
        data: frame with ``country``, ``store``, ``product``, ``date`` and
            ``num_sold`` columns.
        country: value to match in the ``country`` column.
        store: value to match in the ``store`` column.
        product: value to match in the ``product`` column.

    Returns:
        A new frame with the single column ``num_sold``, indexed by ``date``
        as datetimes. ``data`` itself is not modified.
    """
    selector = (
        (data['country'] == country)
        & (data['store'] == store)
        & (data['product'] == product)
    )

    # Take an explicit copy in one `.loc` step; assigning into the result of
    # chained indexing is what triggers pandas' SettingWithCopyWarning.
    subset = data.loc[selector, ['date', 'num_sold']].copy()
    subset['date'] = pd.DatetimeIndex(subset['date'])

    return subset.set_index('date')

In [None]:
def display_decompose(data, result, err_line=0):
    """Plot a series together with the components of its decomposition.

    Four stacked panels are drawn: the ``num_sold`` column of ``data``, then
    ``result.trend``, ``result.seasonal`` and a scatter of ``result.resid``
    with a dashed red reference line.

    Args:
        data: frame with a ``num_sold`` column.
        result: object exposing ``trend``, ``seasonal`` and ``resid`` series.
        err_line: y position of the reference line in the residual panel.
    """
    fig, ax = plt.subplots(4, 1, figsize=(12, 15))

    # Plot the series that was passed in (the notebook-level `df` was used
    # here before, so the `data` argument had no effect).
    sns.lineplot(data=data['num_sold'], ax=ax[0])
    sns.lineplot(data=result.trend, ax=ax[1])
    sns.lineplot(data=result.seasonal, ax=ax[2])

    sns.scatterplot(
        data=result.resid,
        alpha=0.7,
        ax=ax[3])

    # Reference line spanning the whole residual index range.
    ax[3].hlines(
        y=err_line,
        color='red',
        linestyles='--',
        xmin=min(result.resid.index),
        xmax=max(result.resid.index))

    plt.tight_layout()
    plt.show()

## Additive Decomposition

In [None]:
# Isolate a single series: Finland / KaggleRama / Kaggle Sticker.
df = filter_data(train_data, country='Finland', store='KaggleRama', product='Kaggle Sticker')

# Additive model: observed = trend + seasonal + residual.
# NOTE(review): period=30 assumes a cycle of 30 observations (roughly monthly
# if there is one row per day); a weekly pattern would need period=7 — confirm
# the intended seasonality.
result = seasonal_decompose(
    x=df['num_sold'], 
    model='additive', 
    extrapolate_trend='freq', 
    period=30)

# Residuals of an additive model are compared against the default line at 0.
display_decompose(df, result)

## Multiplicative Decomposition

In [None]:
# Isolate a single series: Finland / KaggleRama / Kaggle Sticker.
df = filter_data(train_data, country='Finland', store='KaggleRama', product='Kaggle Sticker')

# Multiplicative model: observed = trend * seasonal * residual.
# NOTE(review): period=30 assumes a cycle of 30 observations (roughly monthly
# if there is one row per day); a weekly pattern would need period=7 — confirm
# the intended seasonality.
result = seasonal_decompose(
    x=df['num_sold'], 
    model='multiplicative', 
    extrapolate_trend='freq', 
    period=30)

# Residuals of a multiplicative model are ratios, so the reference line is 1.
display_decompose(df, result, err_line=1)

To be continued ... 