In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from datetime import datetime, date
from calendar import day_name

# Load Train Data

In [None]:
# Read the training CSV (indexed by row_id), convert the date column to
# datetimes, then preview the first rows.
df = pd.read_csv(
    '../input/tabular-playground-series-jan-2022/train.csv',
    index_col='row_id',
)
df['date'] = pd.to_datetime(df['date'])
df.head()

In [None]:
def convert_detail_days(frame, column):
    """Add calendar feature columns derived from a datetime column, in place.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to extend; it is modified in place and nothing is returned.
    column : str
        Name of a datetime64 column in ``frame``.

    Columns written: ``Year``, ``Month``, ``DayOfMonth``, ``DayOfWeek``
    (English weekday name), ``DayOfYear`` and ``WeekOfYear`` (ISO 8601 week).
    """
    # The vectorized .dt accessor replaces one Python-level lambda call per row.
    dates = frame[column].dt
    frame['Year'] = dates.year
    frame['Month'] = dates.month
    frame['DayOfMonth'] = dates.day
    # day_name() returns English names by default, independent of the process
    # locale (calendar.day_name follows the current locale).
    frame['DayOfWeek'] = dates.day_name()
    frame['DayOfYear'] = dates.dayofyear
    # isocalendar() yields a nullable UInt32 column; cast to a plain integer.
    frame['WeekOfYear'] = dates.isocalendar().week.astype('int64')
# Add the calendar feature columns to the training frame and preview it.
convert_detail_days(frame=df, column='date')
df.head()

# Explore Data

In [None]:
def plot_query_store(store, ax):
    """Draw a WeekOfYear x weekday heatmap of median ``num_sold`` for one store.

    Reads the notebook-level ``df`` frame.

    Parameters
    ----------
    store : str
        Value of ``df.store`` to filter on; also used as the axes title.
    ax : matplotlib.axes.Axes
        Axes the heatmap is drawn on.
    """
    weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    # Aggregate only num_sold: a median over the whole frame also hits the
    # string columns, which raises in pandas >= 2.0.
    medians = (
        df.loc[df['store'] == store]
        .groupby(['WeekOfYear', 'DayOfWeek'])['num_sold']
        .median()
        .reset_index()
    )
    grid = medians.pivot(index='WeekOfYear', columns='DayOfWeek', values='num_sold')
    sb.heatmap(
        grid[weekday_order],  # columns in calendar order instead of alphabetical
        cmap="YlGnBu",
        vmin=0,
        vmax=1200,
        ax=ax,
    )
    ax.set_title(store)
# Draw the week-by-weekday heatmap of each store, side by side.
f, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 11))
for store_name, axis in zip(('KaggleMart', 'KaggleRama'), axes):
    plot_query_store(store_name, ax=axis)
plt.show()

In [None]:
def plot_query_week_and_feature(store, year, column, ax):
    """Draw an annotated weekday x ``column`` heatmap of median ``num_sold``.

    Reads the notebook-level ``df`` frame, restricted to one store and year.

    Parameters
    ----------
    store : str
        Value of ``df.store`` to filter on.
    year : int
        Value of ``df.Year`` to filter on.
    column : str
        Column whose distinct values become the heatmap columns.
    ax : matplotlib.axes.Axes
        Axes the heatmap is drawn on.
    """
    weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    selected = df.loc[(df['store'] == store) & (df['Year'] == year)]
    # Aggregate only num_sold: a median over the whole frame also hits the
    # remaining string columns, which raises in pandas >= 2.0.
    medians = (
        selected
        .groupby(['DayOfWeek', column])['num_sold']
        .median()
        .astype(int)  # the integer annotation format below needs int values
        .reset_index()
    )
    grid = medians.pivot(index=column, columns='DayOfWeek', values='num_sold')
    sb.heatmap(
        grid[weekday_order].T,  # weekdays as rows, in calendar order
        cmap="YlGnBu",
        vmin=0,
        vmax=1000,
        annot=True, fmt="d",
        ax=ax,
    )
    ax.set_title(store + ' - ' + str(year))

In [None]:
# One figure per year: weekday x product heatmaps for both stores.
for year in (2015, 2016, 2017, 2018):
    f, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 3))
    for store_name, axis in zip(('KaggleMart', 'KaggleRama'), axes):
        plot_query_week_and_feature(store_name, year=year, column='product', ax=axis)
    plt.show()

In [None]:
# One figure per year: weekday x country heatmaps for both stores.
for year in (2015, 2016, 2017, 2018):
    f, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 3))
    for store_name, axis in zip(('KaggleMart', 'KaggleRama'), axes):
        plot_query_week_and_feature(store_name, year=year, column='country', ax=axis)
    plt.show()

# Explore Time Series

In [None]:
from statsmodels.stats.diagnostic import acorr_ljungbox
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.statespace.sarimax import SARIMAX

In [None]:
from IPython.display import display_html
def displaySideBySide(dict_table):
    """Render several DataFrames next to each other, each under a bold caption.

    Parameters
    ----------
    dict_table : dict
        Maps a caption to the DataFrame shown under it; tables are rendered
        in the dict's iteration order.
    """
    import html  # stdlib; imported locally so the block is self-contained

    style = 'style="color: black; text-align: center; font-size: 14px; font-weight: bold;"'
    table_open = '<table border="1" class="dataframe">'
    pieces = []
    for name, table in dict_table.items():
        # Escape the caption so characters such as '<' or '&' cannot break the markup.
        caption = ' <caption ' + style + '>' + html.escape(str(name)) + '</caption>'
        pieces.append(table.to_html().replace(table_open, table_open + caption, 1))
    # Only opening <table ...> tags receive the inline style; a blanket
    # replace of 'table' would also rewrite closing tags and caption text.
    html_str = ''.join(pieces).replace('<table ', '<table style="display:inline" ')
    display_html(html_str, raw=True)

# KaggleRama - Norway - Kaggle Hat

In [None]:
def groupByWeekOfYear(frame, store, country, product):
    """Return weekly ``num_sold`` totals for one store/country/product.

    Parameters
    ----------
    frame : pandas.DataFrame
        Needs the columns ``store``, ``country``, ``product``, ``num_sold``,
        ``Year`` and ``WeekOfYear``. When a ``Month`` column is present it is
        used to put year-boundary weeks in chronological order.
    store, country, product : str
        Values the rows must match.

    Returns
    -------
    pandas.DataFrame
        Single column ``num_sold`` with one row per week, indexed 1..n.
        Weeks at the edge of the data may cover fewer than seven days.
    """
    mask = (
        (frame['store'] == store)
        & (frame['country'] == country)
        & (frame['product'] == product)
    )
    rows = frame[mask]
    week = rows['WeekOfYear']
    week_year = rows['Year']
    if 'Month' in rows.columns:
        month = rows['Month']
        # WeekOfYear is an ISO week number, so it must be paired with the ISO
        # year: early-January days in week 52/53 belong to the previous year
        # and late-December days in week 1 to the next one. Pairing them with
        # the calendar year would sort e.g. 1-3 Jan 2016 (week 53) after
        # December 2016.
        belongs_to_prev = ((month == 1) & (week >= 52)).astype(int)
        belongs_to_next = ((month == 12) & (week == 1)).astype(int)
        week_year = week_year - belongs_to_prev + belongs_to_next
    # Sum num_sold only: summing the whole frame would also try to add up the
    # string and datetime columns (the latter raises in pandas >= 2.0).
    weekly = (
        rows['num_sold']
        .groupby([week_year.rename('Year'), week.rename('WeekOfYear')])
        .sum()
        .reset_index(drop=True)
    )
    fr = weekly.to_frame('num_sold')
    fr.index = list(fr.index + 1)  # 1-based week counter, as before
    return fr
# Weekly sales series: KaggleRama / Norway / Kaggle Hat.
df_KaggleRama = groupByWeekOfYear(
    frame=df,
    store='KaggleRama',
    country='Norway',
    product='Kaggle Hat',
)
print(df_KaggleRama.shape)
df_KaggleRama.head()

In [None]:
def plot_acf_pacf(series, maxlags=20):
    """Plot a series on top, with its ACF and PACF side by side underneath.

    Parameters
    ----------
    series : pandas.Series
        Series to plot; its index is used as the x axis of the top panel.
    maxlags : int, default 20
        Number of lags shown in both the ACF and the PACF panel.
    """
    fig = plt.figure(figsize=(15, 8), dpi=100)
    fig.subplots_adjust(bottom=0.025, left=0.025, top=0.975, right=0.975)

    ax_series = fig.add_subplot(2, 1, 1)
    ax_series.plot(series.index, series.values)

    ax_acf = fig.add_subplot(2, 2, 3)
    # Honour maxlags here too; the ACF previously always used 50 lags.
    plot_acf(series, lags=maxlags, ax=ax_acf)

    ax_pacf = fig.add_subplot(2, 2, 4)
    plot_pacf(series, lags=maxlags, method='ywm', ax=ax_pacf)
    plt.show()
# Series, ACF and PACF of the KaggleRama weekly sales.
plot_acf_pacf(series=df_KaggleRama['num_sold'], maxlags=50)

In [None]:
def plot_seasonal_decompose(data, model='add', period=12):
    """Decompose a series with ``seasonal_decompose``, plot and return the result.

    Parameters
    ----------
    data : array-like or pandas.Series
        Series to decompose.
    model : str, default 'add'
        Decomposition type forwarded to ``seasonal_decompose``.
    period : int, default 12
        Length of the seasonal cycle in observations.

    Returns
    -------
    The object returned by ``seasonal_decompose``.
    """
    result = seasonal_decompose(
        data,
        model=model,  # previously hard-coded to 'add', ignoring the argument
        period=period,
        extrapolate_trend='freq',
    )
    # Scope the figure size to this plot instead of changing the global
    # rcParams for every later figure in the notebook.
    with plt.rc_context({'figure.figsize': (12, 8)}):
        result.plot().suptitle('', fontsize=22)
        plt.show()
    return result
# Additive decomposition of the KaggleRama series with a 53-observation period.
decomposeResult = plot_seasonal_decompose(data=df_KaggleRama['num_sold'], model='add', period=53)

In [None]:
def test_Augmented_Dickey_Fuller(data, anpha=0.05):
    """Run ``adfuller`` on ``data`` and print statistic, p-value and critical values.

    The series is reported as non-stationary when the p-value is greater
    than or equal to ``anpha``, and as stationary otherwise.
    """
    adf_result = adfuller(data)
    adf_stat, p_value = adf_result[0], adf_result[1]
    if p_value >= anpha:
        verdict = 'is non-stationary'
    else:
        verdict = 'is stationary'
    print('ADF Statistic: %f' % adf_stat)
    print('p-value:', p_value, '->', verdict)
    print('Critical Values:')
    for level, threshold in adf_result[4].items():
        print('\t%s: %.3f' % (level, threshold))
# Stationarity check of the KaggleRama weekly sales.
test_Augmented_Dickey_Fuller(data=df_KaggleRama['num_sold'])

In [None]:
def split_Train_Test_TimeSeries(frame, split):
    """Split a sequence chronologically, keeping the last ``split`` items as test.

    Parameters
    ----------
    frame : pandas.Series, pandas.DataFrame or any sliceable sequence
        Observations in time order.
    split : int
        Number of trailing observations that go into the test part.

    Returns
    -------
    tuple
        ``(train, test)`` with ``len(test) == split``.

    Raises
    ------
    ValueError
        If ``split`` is negative or larger than ``len(frame)``; such values
        used to turn into negative slice bounds and a silently wrong split.
    """
    n_total = len(frame)
    if not 0 <= split <= n_total:
        raise ValueError(
            'split must be between 0 and len(frame) (%d), got %r' % (n_total, split)
        )
    n_train = n_total - split
    # Slice by position explicitly for pandas objects; plain sequences are
    # sliced directly.
    rows = frame.iloc if hasattr(frame, 'iloc') else frame
    return rows[:n_train], rows[n_train:]
# Keep the last 15 observations of the KaggleRama series as the test set.
train_Rama, test_Rama = split_Train_Test_TimeSeries(frame=df_KaggleRama['num_sold'], split=15)
for part in (train_Rama, test_Rama):
    print(part.shape)

In [None]:
def fit_Model_SARIMA(train, test, order, seasonal_order, forecast=5, freq='D', isShowAIC=True):
    """Fit a SARIMAX model on ``train`` and predict the test period and beyond.

    Parameters
    ----------
    train : pandas.Series
        Training observations.
    test : pandas.Series
        Hold-out observations that directly follow ``train``.
    order, seasonal_order : tuple
        Forwarded to ``SARIMAX``.
    forecast : int, default 5
        Number of extra steps to forecast after the test period.
    freq : str or None, default 'D'
        Forwarded to ``SARIMAX``.
    isShowAIC : bool, default True
        When true, print the AIC followed by the fitted model's summary.

    Returns
    -------
    tuple
        ``(pre_val, fc_val, model_fit)``: out-of-sample predictions labelled
        with ``test.index``, the ``forecast`` steps after the test period,
        and the fitted results object.
    """
    model = SARIMAX(
        train,
        order=order,
        seasonal_order=seasonal_order,
        freq=freq,
    )
    model_fit = model.fit()

    # Forecast the whole horizon once and cut it by position. Asking
    # predict() for start/end equal to the 1-based test labels is off by one,
    # because statsmodels reads integer start/end as zero-based positions.
    n_test = len(test)
    horizon = model_fit.forecast(n_test + forecast)
    pre_val = horizon.iloc[:n_test].copy()
    fc_val = horizon.iloc[n_test:].copy()
    if n_test > 0:
        both_integer = (
            pd.api.types.is_integer_dtype(test.index.dtype)
            and pd.api.types.is_integer_dtype(horizon.index.dtype)
        )
        if both_integer:
            # Continue the caller's numbering so the forecast labels follow
            # the last test label instead of overlapping the test period.
            fc_val.index = fc_val.index + (test.index[0] - horizon.index[0])
        pre_val.index = test.index

    if isShowAIC:
        print('AIC: ', model_fit.aic)
        print(model_fit.summary())
    return pre_val, fc_val, model_fit
# Fit the seasonal model on the KaggleRama training part.
predict_values_Rama, forecast_values_Rama, model_ts = fit_Model_SARIMA(
    train_Rama, test_Rama,
    order=(0, 1, 1), seasonal_order=(1, 1, 0, 53),
    forecast=5, freq=None,
)

In [None]:
def plot_Show_Model_TimeSeries(train, test, predict, forecast, isFullTrain=True, numberTrain=15):
    """Plot the train, test, predicted and forecast series on one figure.

    Parameters
    ----------
    train, test, predict, forecast : pandas.Series
        Series drawn in gray, blue, red and green respectively.
    isFullTrain : bool, default True
        When false, only the last ``numberTrain`` training points are drawn.
    numberTrain : int, default 15
        Size of the training tail shown when ``isFullTrain`` is false.
    """
    if isFullTrain:
        train_show = train
    else:
        # Clamp at 0: a negative start would be read as "count from the end"
        # and show the wrong window when numberTrain exceeds len(train).
        start = max(len(train) - numberTrain, 0)
        train_show = train[start:len(train)]

    plt.figure(figsize=(15, 3))
    layers = (
        (train_show, 'gray', 'train'),
        (test, 'blue', 'test'),
        (predict, 'red', 'predict'),
        (forecast, 'green', 'forecast'),
    )
    for series, color, label in layers:
        plt.plot(series.index, series.values, color=color, label=label)
    plt.legend(loc="upper left")
    plt.show()
# Show the last 15 training points next to test, prediction and forecast.
plot_Show_Model_TimeSeries(
    train=train_Rama,
    test=test_Rama,
    predict=predict_values_Rama,
    forecast=forecast_values_Rama,
    isFullTrain=False,
    numberTrain=15,
)

# KaggleMart - Norway - Kaggle Hat

In [None]:
# Weekly sales series: KaggleMart / Norway / Kaggle Hat.
df_KaggleMart = groupByWeekOfYear(
    frame=df,
    store='KaggleMart',
    country='Norway',
    product='Kaggle Hat',
)
print(df_KaggleMart.shape)
df_KaggleMart.head()

In [None]:
# Series, ACF and PACF of the KaggleMart weekly sales.
plot_acf_pacf(series=df_KaggleMart['num_sold'], maxlags=50)

In [None]:
# Additive decomposition of the KaggleMart series with a 53-observation period.
decomposeResult = plot_seasonal_decompose(data=df_KaggleMart['num_sold'], model='add', period=53)

In [None]:
# Stationarity check of the KaggleMart weekly sales.
test_Augmented_Dickey_Fuller(data=df_KaggleMart['num_sold'])

In [None]:
# Keep the last 15 observations of the KaggleMart series as the test set.
train_Mart, test_Mart = split_Train_Test_TimeSeries(frame=df_KaggleMart['num_sold'], split=15)
for part in (train_Mart, test_Mart):
    print(part.shape)

In [None]:
# Fit the same seasonal model on the KaggleMart training part.
predict_values_Mart, forecast_values_Mart, model_ts = fit_Model_SARIMA(
    train_Mart, test_Mart,
    order=(0, 1, 1), seasonal_order=(1, 1, 0, 53),
    forecast=5, freq=None,
)

In [None]:
# Show the last 15 training points next to test, prediction and forecast.
plot_Show_Model_TimeSeries(
    train=train_Mart,
    test=test_Mart,
    predict=predict_values_Mart,
    forecast=forecast_values_Mart,
    isFullTrain=False,
    numberTrain=15,
)

# Compare KaggleRama - KaggleMart

In [None]:
# Test-vs-prediction tables and forecast tables of both stores, side by side.
comparison_tables = {
    'KaggleRama': pd.DataFrame({'Test': test_Rama, 'Predict': predict_values_Rama}),
    'KaggleMart': pd.DataFrame({'Test': test_Mart, 'Predict': predict_values_Mart}),
    'Forecast_Rama': pd.DataFrame({'Forecast': forecast_values_Rama}),
    'Forecast_Mart': pd.DataFrame({'Forecast': forecast_values_Mart}),
}
displaySideBySide(comparison_tables)

In [None]:
def plot_compare(preR, forR, preM, forM, title=''):
    """Plot prediction followed by forecast for both stores on one figure.

    ``preR``/``forR`` are concatenated into the 'Rama' line and
    ``preM``/``forM`` into the 'Mart' line; ``title`` is the figure title.
    """
    plt.figure(figsize=(7, 3))
    lines = (
        (pd.concat([preR, forR]), '#BA4A00', 'Rama'),
        (pd.concat([preM, forM]), '#F0B27A', 'Mart'),
    )
    for curve, color, label in lines:
        plt.plot(curve.index, curve.values, color=color, label=label)
    plt.title(title)
    plt.legend(loc="upper left")
    plt.show()
# Compare both stores' prediction + forecast curves for Norway / Kaggle Hat.
plot_compare(
    predict_values_Rama,
    forecast_values_Rama,
    predict_values_Mart,
    forecast_values_Mart,
    title='Norway - Kaggle Hat',
)

# Other Country - Product

In [None]:
import sys
import warnings

# Silence all warnings unless warning options were given on the command line.
if not sys.warnoptions:
    warnings.simplefilter("ignore")

In [None]:
# Repeat the store comparison for further country / product combinations.
# Loop-local names are used throughout so the Norway / Kaggle Hat results of
# the earlier cells (df_KaggleRama, train_Rama, predict_values_Rama, ...) are
# not overwritten.
list_country = ['Finland', 'Sweden']
list_product = ['Kaggle Mug', 'Kaggle Sticker']
for country in list_country:
    for product in list_product:
        store_curves = {}
        for store in ('KaggleRama', 'KaggleMart'):
            weekly = groupByWeekOfYear(df, store=store, country=country, product=product)
            train_part, test_part = split_Train_Test_TimeSeries(weekly['num_sold'], 15)
            predicted, forecasted, _ = fit_Model_SARIMA(
                train=train_part, test=test_part,
                order=(0, 1, 1), seasonal_order=(1, 1, 0, 53),
                forecast=5, isShowAIC=False, freq=None,
            )
            store_curves[store] = (predicted, forecasted)

        plot_compare(
            preR=store_curves['KaggleRama'][0],
            forR=store_curves['KaggleRama'][1],
            preM=store_curves['KaggleMart'][0],
            forM=store_curves['KaggleMart'][1],
            title=country + ' - ' + product,
        )

# Decide

Select KaggleRama, because its values are higher than KaggleMart's across all products and countries.