In [None]:
import numpy as np 
import pandas as pd 
import math
import dateutil

from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess
from category_encoders import MEstimateEncoder
from sklearn.model_selection import KFold, cross_val_score

from scipy.special import boxcox1p, inv_boxcox1p

import os
# Walk the Kaggle input directory tree. The print statement is commented out,
# so this loop only traverses the tree and produces no output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        pass
        #print(os.path.join(dirname, filename))


import matplotlib.pyplot as plt
import seaborn as sns

from warnings import simplefilter
simplefilter("ignore")  # ignore warnings to clean up output cells

# Set Matplotlib defaults.
# The seaborn style sheets were renamed to "seaborn-v0_8-*" in matplotlib 3.6
# and the old names were removed afterwards, so pick whichever name this
# matplotlib installation actually provides.
plt.style.use(
    "seaborn-v0_8-whitegrid"
    if "seaborn-v0_8-whitegrid" in plt.style.available
    else "seaborn-whitegrid"
)
plt.rc("figure", autolayout=True, figsize=(11, 5))
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)
# Shared keyword arguments for pandas line plots (grey line, dark markers).
plot_params = dict(
    color="0.75",
    style=".-",
    markeredgecolor="0.25",
    markerfacecolor="0.25",
    legend=False,
)
%config InlineBackend.figure_format = 'retina'


In [None]:
# Competition training data; `date` is parsed to datetime on load.
df_train = pd.read_csv(
    "/kaggle/input/tabular-playground-series-jan-2022/train.csv",
    parse_dates=["date"],
)
# Public holidays per country; `Date` is parsed to datetime on load.
Holidays = pd.read_csv(
    "/kaggle/input/holidays-finland-norway-sweden-20152019/Holidays_Finland_Norway_Sweden_2015-2019.csv",
    parse_dates=['Date'],
)
# GDP per capita is used; the total-GDP alternative is kept for reference:
#GDP = pd.read_csv("/kaggle/input/gdp-20152019-finland-norway-and-sweden/GDP_data_2015_to_2019_Finland_Norway_Sweden.csv")
GDP = pd.read_csv(
    "/kaggle/input/gdp-per-capita-finland-norway-sweden-201519/GDP_per_capita_2015_to_2019_Finland_Norway_Sweden.csv"
)

In [None]:
# Restructuring GDP: melt the table so that every non-`year` column becomes a
# value of a new `country` column, giving one row per (year, country) with the
# figure in `GDP`. This is the shape needed to merge on ['country', 'year'].
GDP_df = GDP.copy()
# For working with GDP instead of GDP per capita
# GDP_df['Finland'] = GDP_df['Finland'].copy()
# GDP_df['Norway']  = GDP_df['Norway'].copy()
# GDP_df['Sweden']  = GDP_df['GDP_Sweden'].copy()
# GDP_df.drop(['GDP_Finland','GDP_Norway','GDP_Sweden'], axis=1, inplace=True)
#GDP_df.set_index(['year'], inplace=True)
# `var_name` must be a scalar for non-MultiIndex columns; a one-element list
# only worked by accident on old pandas and raises ValueError on newer ones.
GDP_df = GDP_df.melt(id_vars=['year'], var_name='country', value_name='GDP')

# EDA:

## Holiday effect:
Sales are expected to rise around holidays, so here I'll study the holiday effect using a 7-day rolling window (to suppress the weekend effect) and see how the smoothed sales change around the holidays extracted from the Holiday dataset.

In [None]:
country = 'Finland'
store = 'KaggleMart'
product = 'Kaggle Mug'
year = 2016

# One (country, store, product) series for a single year, indexed by date.
# set_index is chained (not in-place) so we never mutate a slice of df_train.
ex_df = (
    df_train
    .query('country == @country and store == @store and product == @product and date.dt.year==@year')
    .set_index('date')
)

holiday_dates = Holidays.query('Country == @country and Date.dt.year == @year')['Date']

# Restrict the rolling mean to the numeric target: the frame also holds string
# columns (country/store/product), which rolling().mean() rejects on pandas 2.
moving_average = ex_df[['num_sold']].rolling(
    window=7,         # 7-day window, smooths out the weekly cycle
    center=True,      # puts the average at the center of the window
    min_periods=4,    # choose about half the window size
).mean()

ax = ex_df.num_sold.plot(style=".", color="0.5")
# Mark every holiday of the selected country/year with a dashed red line.
for date in holiday_dates:
    ax.axvline(date, color='red', alpha=0.6, linestyle='--')

moving_average.plot(
    y='num_sold', ax=ax, linewidth=3, title="Weekly moving Average", legend=False,
);

It appears the effect of Holidays starts in a window after each Holiday. We may need to add extra features to encompass this.

## Seasonality Effect (Weekends...etc):

Before looking for any seasonality effects, a Periodogram might be useful:

In [None]:
#Bloc of code from Kaggle's Time-Series course. Modified to allow Periodogram to have different title and color

from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess

# annotations: https://stackoverflow.com/a/49238256/5769929
def seasonal_plot(X, y, period, freq, ax=None):
    """Draw one line per value of `period`, with `freq` on the x-axis.

    Parameters
    ----------
    X : DataFrame holding the columns named by `y`, `period` and `freq`.
    y : name of the column plotted on the y-axis.
    period : name of the column whose distinct values become separate lines.
    freq : name of the column plotted on the x-axis.
    ax : optional matplotlib Axes; a new one is created when omitted.

    Returns
    -------
    The Axes the lines were drawn on. Each line is annotated at the right
    edge of the plot with a value of `period`, in the line's own colour.
    """
    if ax is None:
        _, ax = plt.subplots()

    period_values = X[period]
    colours = sns.color_palette("husl", n_colors=period_values.nunique(),)
    ax = sns.lineplot(
        data=X,
        x=freq,
        y=y,
        hue=period,
        palette=colours,
        ci=False,
        legend=False,
        ax=ax,
    )
    ax.set_title(f"Seasonal Plot ({period}/{freq})")

    # Label each line just right of the axes, at the height of its last point.
    for drawn_line, label in zip(ax.lines, period_values.unique()):
        last_y = drawn_line.get_ydata()[-1]
        ax.annotate(
            label,
            xy=(1, last_y),
            xycoords=ax.get_yaxis_transform(),
            xytext=(6, 0),
            textcoords="offset points",
            color=drawn_line.get_color(),
            size=14,
            va="center",
        )
    return ax


def plot_periodogram(ts, detrend='linear', ax=None, title="Periodogram", color="purple", label=None):
    """Plot the periodogram of a daily series with frequencies in cycles per year.

    Parameters
    ----------
    ts : 1-D array-like of observations (one per day).
    detrend : detrending mode forwarded to `scipy.signal.periodogram`.
    ax : optional Axes to draw on; a new one is created when omitted.
    title : title set on the Axes.
    color : colour of the step line.
    label : legend label of the step line.

    Returns
    -------
    The Axes that was drawn on.
    """
    from scipy.signal import periodogram

    # Sampling frequency: observations per year. This is the value that
    # pd.Timedelta("1Y") / pd.Timedelta("1D") used to produce (365.2425); the
    # "Y" unit is rejected by newer pandas, so the constant is spelled out.
    fs = 365.2425
    frequencies, spectrum = periodogram(
        ts,
        fs=fs,
        detrend=detrend,
        window="boxcar",
        scaling='spectrum',
    )
    if ax is None:
        _, ax = plt.subplots()
    ax.step(frequencies, spectrum, color=color, label=label)
    ax.set_xscale("log")
    # Ticks are expressed in cycles per year.
    ax.set_xticks([1, 2, 4, 6, 12, 26, 52, 104])
    ax.set_xticklabels(
        [
            "Annual (1)",
            "Semiannual (2)",
            "Quarterly (4)",
            "Bimonthly (6)",
            "Monthly (12)",
            "Biweekly (26)",
            "Weekly (52)",
            "Semiweekly (104)",
        ],
        rotation=30,
    )
    ax.ticklabel_format(axis="y", style="sci", scilimits=(0, 0))
    ax.set_ylabel("Variance")
    ax.set_title(title)
    return ax


To account for country/store differences, I'll plot all country periodicities for each product (One store):

In [None]:
fig, (ax1, ax2, ax3) = plt.subplots(3, figsize=(15,15))

# One panel per product; within each panel one periodogram per country,
# all for the same store. Norway uses plot_periodogram's default colour.
store = 'KaggleRama'

for panel, product in zip((ax1, ax2, ax3), ('Kaggle Mug', 'Kaggle Sticker', 'Kaggle Hat')):
    for country, color in (('Norway', 'purple'), ('Finland', 'red'), ('Sweden', 'blue')):
        # Chained set_index: never mutates a slice of df_train in place.
        ex_df = (
            df_train
            .query('country == @country and store == @store and product == @product')
            .set_index('date')
        )
        plot_periodogram(
            ex_df.num_sold,
            ax=panel,
            title=f"Periodogram: {product}",
            label=country,
            color=color,
        )
    panel.legend()



Then vary the store for fixed country (To avoid cluttering the notebook, only one country was used; you can edit the notebook and change the country):

In [None]:
fig, (ax1, ax2, ax3) = plt.subplots(3, figsize=(15,15))

# One panel per product; within each panel one periodogram per store,
# all for the same country. The title is derived from the loop variable so
# every panel is labelled with the product it actually shows.
country = 'Sweden'

for panel, product in zip((ax1, ax2, ax3), ('Kaggle Mug', 'Kaggle Sticker', 'Kaggle Hat')):
    for store, color in (('KaggleRama', 'blue'), ('KaggleMart', 'red')):
        # Chained set_index: never mutates a slice of df_train in place.
        # After the loops, ex_df holds the last combination
        # (Sweden / KaggleMart / Kaggle Hat), indexed by date.
        ex_df = (
            df_train
            .query('country == @country and store == @store and product == @product')
            .set_index('date')
        )
        plot_periodogram(
            ex_df.num_sold,
            ax=panel,
            title=f"Periodogram: {product} ({country})",
            label=store,
            color=color,
        )
    panel.legend()


The periodicity appears to vary significantly from country to country, from store to store and from product to product. Although the product-to-product periodicity appears to be the strongest and seems decoupled from the country or store, the country/store combination should still be taken into account. (Thanks ambrosM)

In [None]:
# Work on a copy so the calendar columns added below do not leak into ex_df.
X = ex_df.copy()

X["day"] = X.index.dayofweek  # the x-axis (freq)
# ISO week number, the seasonal period (period). DatetimeIndex.week was
# removed in pandas 2; isocalendar().week yields the same values. Converting
# to a plain integer array avoids index alignment and the nullable UInt32 dtype.
X["week"] = X.index.isocalendar().week.astype("int64").to_numpy()

# days within a year
X["dayofyear"] = X.index.dayofyear
X["year"] = X.index.year
fig, (ax0, ax1) = plt.subplots(2, 1, figsize=(11, 6))
seasonal_plot(X, y="num_sold", period="week", freq="day", ax=ax0)
seasonal_plot(X, y="num_sold", period="year", freq="dayofyear", ax=ax1);

The number of units sold varies with the day of the week (predictably), so weekdays/weekends should be encoded. There is also a yearly pattern, as found from the periodogram.

Conclusion:

+ Add an x-day window after each Holiday. (Probably by augmenting the Holiday Dataframe itself)
+ Encode weekdays.
+ Encode a bimonthly/monthly Fourier Feature tied to each product at a time and each store/country combination.

## Distribution of num_sold:

In [None]:
# Distribution of the raw target, then of log1p(target) for comparison.
sns.displot(df_train['num_sold'])
sns.displot(np.log1p(df_train['num_sold']))

Apparently the distribution of `num_sold` is not Gaussian on its own. Log-transforming it makes it behave better.

# Feature Engineering:

Before doing any feature engineering on time features we should one-hot encode the country store and product as well as adding a GDP column.

Now we need to change the Holiday dataset by adding a 7-day window after each Holiday for its effect to be pronounced.

In [None]:
def add_holiday_window(holiday_df, window=7):
    """Extend every holiday with up to `window` follow-up days and one-hot encode them.

    Steps:
      1. Add a "New Year's Eve" row (31 December, 2015-2019) for Norway and
         Finland.
      2. Sort by country and date; for every holiday generate rows for the
         following days, named "<holiday name><offset>". The number of extra
         days is `window`, or fewer when the next row's holiday is closer
         than that (the window stops the day before it).
      3. Rename the columns to date / country / day and one-hot encode `day`.

    Parameters
    ----------
    holiday_df : DataFrame with columns Date (datetime), Country, LocalName,
        Name and Fixed. It is not modified.
    window : maximum number of days added after each holiday.

    Returns
    -------
    DataFrame with columns `date`, `country` and one `day_<name>` indicator
    column per distinct holiday / follow-up-day name. Duplicate
    (date, country) rows are kept on purpose.
    """
    # Build all extra rows first and concatenate once: DataFrame.append was
    # removed in pandas 2 and appending inside a loop is quadratic anyway.
    new_years_eve = [
        {
            'Date': pd.Timestamp(f"{year}-12-31"),
            'Country': country,
            'LocalName': 'na',
            'Name': 'New Year\'s Eve',
            'Fixed': True,
        }
        for year in [2015, 2016, 2017, 2018, 2019]
        for country in ['Norway', 'Finland']
    ]
    hol_df = pd.concat([holiday_df, pd.DataFrame(new_years_eve)], ignore_index=True)
    hol_df = hol_df.sort_values(['Country', 'Date'], axis=0, ignore_index=True)
    hol_df = hol_df.drop(["LocalName", "Fixed"], axis=1)

    # Date of the next row in (Country, Date) order; NaT for the very last row.
    next_dates = hol_df['Date'].shift(-1)

    window_rows = []
    for date, country, name, next_date in zip(
        hol_df['Date'], hol_df['Country'], hol_df['Name'], next_dates
    ):
        # When the next row is 2015-01-01 we have crossed into the next
        # country's block; treat the following holiday as lying after the data.
        if next_date == pd.Timestamp('2015-01-01'):
            next_date = pd.Timestamp('2020-01-05')

        # Free days between this holiday and the next one (NaN for the last row).
        delta_date = (next_date - date).days - 1
        if math.isnan(delta_date) or delta_date > (window - 1):
            delta_date = window

        for deltaday in range(1, delta_date + 1):
            window_rows.append(
                {
                    'Date': date + pd.Timedelta(days=deltaday),
                    'Country': country,
                    'Name': f"{name}{deltaday}",
                }
            )

    if window_rows:
        # DO NOT REMOVE DUPLICATES
        hol_df = pd.concat([hol_df, pd.DataFrame(window_rows)], ignore_index=True)
    hol_df = hol_df.sort_values(['Date'], axis=0, ignore_index=True)
    #hol_df.drop(range(1057,1064),axis=0, inplace=True) #Remove the last values for Sweden for 2020

    # Select and rename by name rather than by position.
    hol_df = hol_df[['Date', 'Country', 'Name']].rename(
        columns={'Date': 'date', 'Country': 'country', 'Name': 'day'}
    )
    #Add Dummies:
    #hol_df = pd.get_dummies(hol_df, columns=['country'], drop_first=True)
    # Pin an integer dtype: newer pandas defaults to bool indicator columns.
    hol_df = pd.get_dummies(hol_df, columns=['day'], drop_first=False, dtype=np.uint8)
    return hol_df

Before one-hot encoding, it may be a good idea to do Target Encoding on the country/store/product combination. This will be done through a cross-fold trick (Got it from the Feature Engineering course). (See hidden cell below)

In [None]:
class CrossFoldEncoder:
    """Out-of-fold wrapper around a category encoder class.

    `encoder` is an encoder class (not an instance); it is instantiated once
    per fold as `encoder(cols=cols, **kwargs)`. Folds come from a 5-split
    KFold.
    """

    def __init__(self, encoder, **kwargs):
        self.encoder_ = encoder
        self.kwargs_ = kwargs  # keyword arguments for the encoder
        self.cv_ = KFold(n_splits=5)

    def fit_transform(self, X, y, cols):
        """Encode `cols` of X out-of-fold.

        For every split, a fresh encoder is fitted on one part of the rows and
        used to transform the other part, so no row is encoded by an encoder
        that saw its own target. The fitted encoders are kept for `transform`.

        Returns a DataFrame with one "<col>_encoded" column per entry in cols.
        """
        self.fitted_encoders_ = []
        self.cols_ = cols
        held_out_parts = []
        for fit_rows, held_out_rows in self.cv_.split(X):
            fold_encoder = self.encoder_(cols=cols, **self.kwargs_)
            fold_encoder.fit(X.iloc[fit_rows, :], y.iloc[fit_rows])
            transformed = fold_encoder.transform(X.iloc[held_out_rows, :])
            held_out_parts.append(transformed[cols])
            self.fitted_encoders_.append(fold_encoder)
        X_encoded = pd.concat(held_out_parts)
        X_encoded.columns = [name + "_encoded" for name in X_encoded.columns]
        return X_encoded

    def transform(self, X):
        """Encode new data as the average of the encodings from every fold's encoder."""
        import functools

        per_fold = [
            fold_encoder.transform(X)[self.cols_]
            for fold_encoder in self.fitted_encoders_
        ]
        summed = functools.reduce(
            lambda left, right: left.add(right, fill_value=0), per_fold
        )
        X_encoded = summed / len(per_fold)
        X_encoded.columns = [name + "_encoded" for name in X_encoded.columns]
        return X_encoded

Then we encode Fourier features and week encodings for the countrystore combinations as well as products.

In [None]:
def fourier_encoding(df_train, drop_common=True): #Returns encodings for countrystore combinations and products.
    """Build time features interacted with country+store and product indicators.

    A DeterministicProcess is built over the unique dates of `df_train`
    (constant, order-2 trend, seasonal indicators, plus CalendarFourier terms
    with freq="A", order=8) and merged onto `df_train` by `date`. A positional
    slice of the resulting columns is then multiplied by one-hot indicators of
    the country+store combination and of the product.

    Parameters
    ----------
    df_train : DataFrame that must contain `date`, `country`, `store`,
        `product`, `num_sold`, `year` and `GDP` (the last three are dropped).
    drop_common : when True, the un-interacted columns at positions 5:30 are
        dropped from the result as well.

    Returns
    -------
    DataFrame with the remaining leading columns of `df_train` plus the
    interaction columns named "<feature>_<country><store>" and
    "<feature>_<product>".

    NOTE(review): the slices [5:30] and [30:40] are positional. They presumably
    assume 5 leading columns (e.g. row_id, date, country, store, product),
    25 deterministic terms, and then `countrystore` plus 9 dummy columns
    (6 country/store, 3 product) - confirm against the actual column layout
    before changing any upstream column.
    """
    fourier = CalendarFourier(freq="A", order=8)
    
    dp = DeterministicProcess(
        index=df_train.date.unique(),
        constant=True,               # dummy feature for bias (y-intercept)
        order=2,                     # trend (order 1 means linear)
        seasonal=True,               # weekly seasonality (indicators)
        additional_terms=[fourier],  # annual seasonality (fourier)
        drop=True,                   # drop terms to avoid collinearity
    )
    train_encoding = dp.in_sample()
    # Expose the index as a `date` column so it can be used as the merge key.
    train_encoding['date'] = train_encoding.index
    train_encoding = df_train.merge(train_encoding,on="date",how="left");
    # 'num_sold' is listed twice in the labels to drop.
    train_encoding.drop(['num_sold','year','GDP','num_sold'], axis=1, inplace=True)
    train_encoding['countrystore'] = train_encoding['country'].astype(str) + train_encoding['store'].astype(str)
    # Merge the frame with its own one-hot expansion (no explicit key, so the
    # shared columns are used) to append countrystore_* and product_* indicators.
    train_encoding = train_encoding.merge(pd.get_dummies(train_encoding, columns=['countrystore','product'], drop_first=False), how="left")
    # Positional slice of the columns to interact - presumably the
    # deterministic/Fourier terms; see NOTE(review) in the docstring.
    encoded_cols = train_encoding.columns.tolist()[5:30].copy()
    for encoded_col in encoded_cols:
        # Feature x (country+store) indicator.
        for country in train_encoding['country'].unique():
            for store in train_encoding['store'].unique():
                train_encoding[f"{encoded_col}_{country}{store}"] = train_encoding[f"countrystore_{country}{store}"] * train_encoding[encoded_col]
        # Feature x product indicator.
        for product in train_encoding['product'].unique():
            train_encoding[f"{encoded_col}_{product}"] = train_encoding[f"product_{product}"] * train_encoding[encoded_col]
    cols_to_drop = train_encoding.columns.tolist()[30:40]
    train_encoding.drop(cols_to_drop, axis=1, inplace=True) #Drop countrystore/product encodings.
    if drop_common: #If we want to drop the common columns
        cols_to_drop = train_encoding.columns.tolist()[5:30]
        train_encoding.drop(cols_to_drop, axis=1, inplace=True) #Drop the un-interacted (common) columns.
    return train_encoding

In [None]:
from  category_encoders.cat_boost import CatBoostEncoder
lam=0.00  # Box-Cox lambda for the target transform; with 0, boxcox1p is log1p
def process_data(df, df_test=None):
    """Turn the raw train (and optionally test) frame into model-ready matrices.

    Pipeline: merge GDP by (country, year); merge the Fourier/seasonal
    interaction features; merge the holiday indicators (9-day window); fill
    missing values with 0; Box-Cox transform `num_sold` with the module-level
    `lam`; split by year (train: < 2018, validation: 2018, test: 2019);
    add cross-fold CatBoost target encodings; one-hot encode
    country / store / product.

    Parameters
    ----------
    df : training DataFrame with `row_id`, `date`, `country`, `store`,
        `product` and `num_sold`. It is not modified.
    df_test : optional test DataFrame with the same columns except `num_sold`.
        When given, it is processed together with `df` so both share features.

    Returns
    -------
    (df, df_val, y_train, y_val, df_test, csp_encoder): training features,
    validation features, transformed training target, transformed validation
    target, test features (a placeholder frame with only a `year` column when
    no test set was passed), and the fitted country/store/product encoder.
    """
    df = df.copy()
    if df_test is not None:
        df_test = df_test.copy()
        df_test['num_sold'] = 0
        # DataFrame.append was removed in pandas 2; concat is the equivalent.
        df = pd.concat([df, df_test])
        
    #Step 1: GDP:
    
    df['year'] = df.date.dt.year
    #df['week'] = df.date.dt.week
    df = df.merge(GDP_df, on=['country','year'], how="left")
    #Step 2: Encoding Fourier features and week features
    encoding = fourier_encoding(df)
    df = df.merge(encoding, how="left")

    #Step 3: Holidays:
    hol_df = add_holiday_window(Holidays, window=9)
    df = df.merge(hol_df, on=["date","country"], how="left");
    
    #Filling missing values with 0, due to left join:
    df = df.fillna(0)

    #Step 4: Log-transforming num_sold:
    df['num_sold'] =boxcox1p(df['num_sold'],lam)
    
    #Starting here, the train-validation split must be done to avoid data leakage during Target Encoding.
    if df_test is not None:
        df_test = df.query("year==2019").drop(['row_id','num_sold'], axis=1)
    
    # Explicit copies so the column edits below act on independent frames
    # rather than on slices of the combined frame.
    df_val = df.query("year==2018").copy()
    df = df.query("year < 2018").copy()
    #df, df_val = df.loc[:19727], df.loc[19728:]
    
    y_train = df.pop('num_sold')
    y_val   = df_val.pop('num_sold')
    
    df.drop(['row_id'], axis=1, inplace=True)
    df_val.drop(['row_id'], axis=1, inplace=True)
    
    #Step 5: Target Encoding: 
    df['CSP'] = df['country'].astype(str) + df['store'].astype(str) + df['product'].astype(str) #+ df['week'].astype(str) #Combine for encoding
    df_val['CSP'] = df_val['country'].astype(str) + df_val['store'].astype(str) + df_val['product'].astype(str) #+ df['week'].astype(str)
    
    country_store_product_cols = ["CSP"]
    csp_encoder = CrossFoldEncoder(CatBoostEncoder, a=10)
    train_csp_encoded = csp_encoder.fit_transform(df, y_train, cols=country_store_product_cols)
    val_csp_encoded = csp_encoder.transform(df_val)
    
    df.drop(['CSP'], axis=1, inplace=True)
    df_val.drop(['CSP'], axis=1, inplace=True)
    
    df['CSP_encoded'] = train_csp_encoded
    df_val['CSP_encoded'] = val_csp_encoded
    
    prod_encoder = CrossFoldEncoder(CatBoostEncoder, a=10)
    train_prod_encoded = prod_encoder.fit_transform(df, y_train, cols=['product']) * 1e4#/ df.GDP
    val_prod_encoded = prod_encoder.transform(df_val)* 1e4
    
    df['prod_encoded'] = train_prod_encoded['product_encoded'] #/ df.GDP
    df_val['prod_encoded'] = val_prod_encoded['product_encoded'] #/ df_val.GDP
    
    if df_test is not None:
        df_test['CSP'] = df_test['country'].astype(str) + df_test['store'].astype(str) + df_test['product'].astype(str) #+ df['week'].astype(str)
        test_csp_encoded = csp_encoder.transform(df_test)
        df_test.drop(['CSP'], axis=1, inplace=True)
        df_test['CSP_encoded'] = test_csp_encoded
        test_prod_encoded = prod_encoder.transform(df_test)* 1e4
        df_test['prod_encoded'] = test_prod_encoded['product_encoded'] #/ df_test.GDP
    #TODO: Perhaps encode product alone and divide it by GDP?
    
    #Step 6: One-hot Encoding:
    if df_test is not None:
        df_test = pd.get_dummies(df_test, columns=['country','store','product'], drop_first=True)
        df_test.drop(['date'], axis=1, inplace=True)
        
    df = pd.get_dummies(df, columns=['country','store','product'], drop_first=True)
    df_val = pd.get_dummies(df_val, columns=['country','store','product'], drop_first=True)

    df.drop(['date'], axis=1, inplace=True)
    df_val.drop(['date'], axis=1, inplace=True)
    
    if df_test is None:
        df_test = pd.DataFrame(columns=['year']) #A trick to avoid issues in training
    
    return df, df_val, y_train, y_val, df_test, csp_encoder   



# Training:
As per ambrosM's advice, I've implemented a small block of code that removes variables whose coefficients are too low and returns a model based on the reduced variable set.

In [None]:
from sklearn.linear_model import Ridge
def SMAPE(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent, rounded to 5 decimals.

    SMAPE = mean( 200 * |y_true - y_pred| / (|y_true| + |y_pred|) ), where a
    term whose denominator is zero (both values are 0) counts as 0.

    Parameters
    ----------
    y_true, y_pred : array-likes of equal length (lists, arrays or Series).
        Values are compared by position; any pandas index is ignored.

    Returns
    -------
    The score as a NumPy float.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # Use |y_true| as well, so negative actuals cannot cancel the prediction
    # in the denominator.
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 200.0
    # Divide only where the denominator is non-zero; other terms stay at 0.
    diff = np.divide(
        np.abs(y_true - y_pred),
        denominator,
        out=np.zeros_like(denominator),
        where=denominator != 0,
    )
    return np.round(np.mean(diff), 5)

def remove_parameters_iter(df_train, df_test=None, n=1, eps=1e-12):
    """Fit a Ridge model and iteratively prune features with negligible coefficients.

    The data is prepared with `process_data`, a Ridge model (alpha=1e-4, no
    intercept) is fitted, and then up to `n` times the features whose
    coefficient magnitude is below `eps` are dropped and the model is refitted.
    After every refit the validation SMAPE (on the back-transformed target) is
    printed. Pruning stops as soon as the score no longer improves; if it got
    worse, the state from before that round is returned.

    Parameters
    ----------
    df_train : raw training DataFrame passed to `process_data`.
    df_test : optional raw test DataFrame passed to `process_data`.
    n : maximum number of pruning rounds.
    eps : coefficients with |coef| < eps are considered negligible.

    Returns
    -------
    Tuple (model, train_x, val_x, train_y, val_y, test_x, orig_test_x,
    csp_encoder, dropped_columns, orig_train_x, orig_val_x), where the
    `orig_*` frames are the un-pruned feature matrices (still containing
    `year`).
    """
    train_x, val_x, train_y, val_y, test_x, csp_encoder = process_data(df_train, df_test=df_test)
    orig_train_x, orig_val_x, orig_test_x = train_x.copy(), val_x.copy(), test_x.copy()
    train_x.drop('year', axis=1, inplace=True)
    val_x.drop('year', axis=1, inplace=True)
    test_x.drop('year', axis=1, inplace=True)
    model = Ridge(alpha=1e-4,fit_intercept=False)
    model.fit(train_x, train_y)
    prev_score = 100
    dropped_columns = []
    # Baseline to fall back on: the un-pruned model. This keeps `prev` defined
    # even if the very first pruning round already makes the score worse.
    prev = (model, train_x.copy(), val_x.copy(), train_y, val_y, test_x.copy(), orig_test_x, csp_encoder, dropped_columns, orig_train_x, orig_val_x)
    
    for _ in range(n):
        # Compare the coefficient magnitude with eps; a signed comparison
        # would also discard every feature with a negative coefficient.
        bad_columns = [train_x.columns[ind] for ind in np.where(np.abs(model.coef_) < eps)[0]]
        dropped_columns = dropped_columns + bad_columns
        train_x.drop(bad_columns, axis=1, inplace=True)
        val_x.drop(bad_columns, axis=1, inplace=True)
        # Without a test set, test_x is a placeholder frame that lacks the
        # feature columns, so missing labels must be tolerated here.
        test_x.drop(bad_columns, axis=1, inplace=True, errors='ignore')
        model = Ridge(alpha=1e-4,fit_intercept=False)
        model.fit(train_x, train_y)
        y_pred = model.predict(val_x)
        score_ = SMAPE(inv_boxcox1p(val_y,lam), inv_boxcox1p(y_pred,lam))
        print(score_)
        if score_ >= prev_score:
            print("Max precision attained")
            if score_ > prev_score:
                # This round made things worse: return the previous state.
                return prev
            break
        prev = (model, train_x.copy(), val_x.copy(), train_y, val_y, test_x.copy(), orig_test_x, csp_encoder, dropped_columns, orig_train_x, orig_val_x)
        prev_score = score_
        
    return model, train_x, val_x, train_y, val_y, test_x, orig_test_x, csp_encoder, dropped_columns, orig_train_x, orig_val_x

In [None]:
# Competition test set; `date` is parsed to datetime on load.
df_test = pd.read_csv("/kaggle/input/tabular-playground-series-jan-2022/test.csv",parse_dates=["date"])

In [None]:
# Prepare train/validation/test features and fit the pruned Ridge model
# (up to 5 pruning rounds, coefficient threshold 1e-5).
model, train_x, val_x, train_y, val_y, test_x, orig_test_x, csp_encoder, dropped_columns, orig_train_x, orig_val_x = remove_parameters_iter(df_train, df_test=df_test, n=5, eps=1e-5)
#train_x = remove_parameters_iter(df_train, df_test=df_test, n=5, eps=1e-5)

In [None]:
# Distribution of the training target returned by remove_parameters_iter.
sns.displot(train_y)

In [None]:
# Validation rows of one series, selected through the one-hot columns:
# country_Norway == 1, store_KaggleRama == 0 and product "Kaggle Mug".
ex_df = orig_val_x.query("country_Norway==1 and country_Sweden==0 and store_KaggleRama==0 and `product_Kaggle Mug`==1")

# Unambiguous ISO dates for the validation year (one row per day is assumed).
val_dates = pd.date_range(start='2018-01-01', end='2018-12-31')

# Actual sales, back-transformed with the same lambda used for the target.
ex_df_y = pd.DataFrame()
ex_df_y['y'] = inv_boxcox1p(val_y.loc[ex_df.index], lam)
ex_df_y['date'] = val_dates

# Bring the features to the layout the pruned model was trained on
# (non-mutating: ex_df is a slice of orig_val_x).
ex_df = ex_df.drop(dropped_columns, axis=1).drop('year', axis=1)

predicted_y = model.predict(ex_df)
predicted_y = pd.Series(predicted_y)
plot_df = pd.DataFrame()
plot_df['y'] = inv_boxcox1p(predicted_y, lam)
plot_df['date'] = val_dates

# Absolute error between prediction and actual, compared by position.
delta_y = pd.DataFrame()
delta_y['y'] = np.abs(np.array(plot_df.y) - np.array(ex_df_y.y))
delta_y['date'] = plot_df['date']

ax = ex_df_y.plot(x = 'date', y='y', style=".", color="0.5")
delta_y.plot(x ='date', y='y', ax=ax,
      linewidth=3, legend=False,
);
plot_df.plot(x ='date', y='y', ax=ax,
      linewidth=3, title="Actual (dots), predicted and absolute error: validation year 2018", legend=False,
);

In [None]:
# The residuals in delta_y belong to the Norwegian series selected in the
# previous cell, so the holiday markers must be Norway's as well.
country = 'Norway'
year = 2018

# Non-mutating: keeps delta_y intact so this cell can be re-run.
delta_by_date = delta_y.set_index('date')
holiday_dates = Holidays.query('Country == @country and Date.dt.year == @year')['Date']

moving_average = delta_by_date.rolling(
    window=7,         # 7-day window, smooths out the weekly cycle
    center=True,      # puts the average at the center of the window
    min_periods=4,    # choose about half the window size
).mean()

ax = moving_average.plot(y='y', #ax=ax,
      linewidth=3, legend=False,
);
# Mark every holiday of the selected country/year with a dashed red line.
for date in holiday_dates:
    ax.axvline(date, color='red', alpha=0.6, linestyle = '--')


In [None]:
# y_pred = model.predict(val_x)
# SMAPE(inv_boxcox1p(val_y,lam), inv_boxcox1p(y_pred,lam))

It seems that the error is not too bad and the values only need some tweaking. So an XGBoost finetuning model will be implemented on the un-trimmed dataset on the residuals.

# Hybrid Model:

In [None]:
# import lightgbm as lgb'

# def train_residuals(train_x, orig_train_x, train_y):
#     lin_model_result = model.predict(train_x)
#     residual = np.expm1(train_y) - np.expm1(lin_model_result)
    


In [None]:
# residual_model = train_residuals(train_x, orig_train_x, train_y)

In [None]:
# y_pred = predict_total(orig_val_x)
# SMAPE(np.expm1(val_y), y_pred)

In [None]:
# def predict_total(orig_x,dropped_columns=dropped_columns):
#     x = orig_x.drop(dropped_columns, axis=1)
#     x.drop('year', axis=1, inplace=True)
#     trended_result = model.predict(x)
#     finetuned_result = residual_model.predict(orig_x)
#     transformed_result = np.expm1(trended_result) + finetuned_result
#     return np.round(transformed_result)

In [None]:
# y_pred = predict_total(orig_val_x)
# SMAPE(np.expm1(val_y), y_pred)

In [None]:
# ex_df = orig_val_x.query("country_Norway==1 and country_Sweden==0 and store_KaggleRama==0 and `product_Kaggle Mug`==1")
# ex_df_y = pd.DataFrame()
# ex_df_y['y'] = np.expm1(val_y.loc[ex_df.index])

# ex_df_y['date'] = pd.date_range(start='1/1/2018', end='31/12/2018')

# # ex_df.drop(dropped_columns, axis=1, inplace=True)
# # ex_df.drop('year', axis=1, inplace=True)

# predicted_y = predict_total(ex_df)
# predicted_y = pd.Series(predicted_y)
# plot_df = pd.DataFrame()
# plot_df['y'] = predicted_y
# plot_df['date'] = pd.date_range(start='1/1/2018', end='31/12/2018')

# delta_y = pd.DataFrame()
# delta_y['y'] = np.abs(np.array(plot_df.y) - np.array(ex_df_y.y))
# delta_y['date'] = plot_df['date']
# ax = ex_df_y.plot(x = 'date', y='y', style=".", color="0.5")
# delta_y.plot(x ='date', y='y', ax=ax,
#       linewidth=3, legend=False,
# );
# plot_df.plot(x ='date', y='y', ax=ax,
#       linewidth=3, title="Weekly moving Average", legend=False,
# );

# Testing:

In [None]:
def test_model(test_x, final=False):
    """Predict `num_sold` for the test features with the global `model`.

    Predictions are back-transformed with `inv_boxcox1p` (module-level `lam`)
    and rounded. `final` is accepted but not used.

    Returns
    -------
    (df_plot, ret_df): `ret_df` holds `row_id` and `num_sold` (the submission
    layout); `df_plot` is the global `df_test` left-merged with `ret_df` on
    `row_id`.
    """
    ret_df = pd.DataFrame({'row_id': df_test['row_id']})
    ret_df['num_sold'] = np.round(inv_boxcox1p(model.predict(test_x), lam))
    df_plot = df_test.merge(ret_df, on="row_id", how="left")
    return df_plot, ret_df

In [None]:
# Predict the test set: df_plot is the test frame with predictions attached,
# ret_df is the row_id / num_sold table.
df_plot, ret_df = test_model(test_x, final=True)

In [None]:
# Display the prediction table (row_id, num_sold).
ret_df

In [None]:
country = 'Finland'
store = 'KaggleMart'
product = 'Kaggle Mug'

# Predicted test series for one (country, store, product), indexed by date.
# set_index is chained (not in-place) so df_plot is never mutated via a slice.
ex_df = (
    df_plot
    .query('country == @country and store == @store and product == @product')
    .set_index('date')
)

holiday_dates = Holidays.query('Country == @country and Date.dt.year == 2019')['Date']

# Restrict the rolling mean to the numeric prediction: the frame also holds
# string columns (country/store/product), which rolling().mean() rejects on
# pandas 2.
moving_average = ex_df[['num_sold']].rolling(
    window=7,         # 7-day window, smooths out the weekly cycle
    center=True,      # puts the average at the center of the window
    min_periods=4,    # choose about half the window size
).mean()

ax = ex_df.num_sold.plot(color="0.5")
# Mark every holiday of the selected country in 2019 with a dashed red line.
for date in holiday_dates:
    ax.axvline(date, color='red', alpha=0.6, linestyle = '--')

moving_average.plot(y='num_sold',
    ax=ax, linewidth=3, title="Weekly moving Average", legend=False,
);

In [None]:
# Write the predictions (row_id, num_sold) as the submission file, without the index.
ret_df.to_csv('submission.csv',index=False)