#### In this notebook I extend my previous EDA and linear model to an XGBoost model and I optimize it for SMAPE using Optuna.

##### We first load the data, taking care to parse the date column as datetime.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn
import dateutil.easter as easter
from xgboost import XGBRegressor
import optuna
from optuna.integration import XGBoostPruningCallback

In [None]:
# Read the competition CSVs; the 'date' column is parsed to datetime while loading
csv_options = {'parse_dates': ['date']}
train = pd.read_csv("../input/tabular-playground-series-jan-2022/train.csv", **csv_options)
test = pd.read_csv("../input/tabular-playground-series-jan-2022/test.csv", **csv_options)

In [None]:
# Show the dtype of every column in the training frame
train.dtypes

##### Before starting with the EDA, it is important to check the data structure. Apparently we have a combination of time series based on countries, stores and products. Let's first check whether all the combinations appear in both train and test.

In [None]:
# Theoretical number of series: product of the number of distinct levels
# of each grouping column
time_series = ['country', 'store', 'product']
combinations = int(train[time_series].nunique().prod())

print(f"There are {combinations} possible combinations")

In [None]:
time_series = ['country', 'store', 'product']

# Distinct (country, store, product) tuples of each split, sorted and re-indexed
# so that the two frames can be compared row by row regardless of where the
# tuples first appeared in the original files.
country_store_product_train = (train[time_series]
                               .drop_duplicates()
                               .sort_values(time_series)
                               .reset_index(drop=True))
country_store_product_test = (test[time_series]
                              .drop_duplicates()
                              .sort_values(time_series)
                              .reset_index(drop=True))

cond_1 = len(country_store_product_train) == combinations
print(f"Are all theoretical combinations present in train: {cond_1}")
# DataFrame.equals returns False when shapes or values differ, whereas the
# element-wise `==` raises ValueError for frames that are not identically
# labelled -- i.e. exactly in the situation this check is meant to detect.
cond_2 = country_store_product_train.equals(country_store_product_test)
print(f"Are combinations the same in train and test: {cond_2}")

##### As a second step let's visualize how time is split between train and test.

In [None]:
train_dates = train.date.drop_duplicates().sort_values()
test_dates = test.date.drop_duplicates().sort_values()
n_train, n_test = len(train_dates), len(test_dates)

fig, ax = plt.subplots(1, 1, figsize = (11, 7))
cmap_cv = plt.cm.coolwarm

# One horizontal band per split: (x positions, y level, colour value, legend label)
bands = [
    (range(n_train), .5, 1, 'train'),
    (range(n_train, n_train + n_test), .55, 0, 'test'),
]
for positions, level, colour_value, label in bands:
    ax.scatter(positions, [level] * len(positions),
               c=[colour_value] * len(positions), marker='_', lw=15, cmap=cmap_cv,
               label=label, vmin=-.2, vmax=1.2)

# Dotted separators at the cumulative day counts 0, 365, 365+366, ...
tick_locations = np.cumsum([0, 365, 366, 365, 365, 365])
ax.vlines(tick_locations, 0, 2, linestyles='dotted', colors='grey')

ax.set_xticks(tick_locations)
ax.set_xticklabels([2015, 2016, 2017, 2018, 2019, 2020], rotation = 0)
ax.set_yticklabels(labels=[])
plt.ylim([0.45, 0.60])
ax.legend(loc="upper left", title="data")

plt.show()

##### Having four complete years available allows various types of testing and modelling. In this EDA we will limit ourselves to using the last available year (2018) as a hold-out, setting our baseline model to be able to forecast an entire year into the future.

##### As a last check we verify that no date is missing from train and test:

In [None]:
# Build the full calendar between the first and last date of each split and
# list the calendar days that do not occur in the data
full_train_range = pd.date_range(start=train_dates.min(), end=train_dates.max())
full_test_range = pd.date_range(start=test_dates.min(), end=test_dates.max())
missing_train = full_train_range.difference(train_dates)
missing_test = full_test_range.difference(test_dates)
print(f"missing dates in train: {len(missing_train)} and in test: {len(missing_test)}")

##### Having completed the checks, we process the datetime information, extracting its informative elements at different time granularities:

In [None]:
# We create different time granularity

def process_time(df):
    """Add calendar features derived from the ``date`` column.

    The columns ``year``, ``month``, ``week``, ``day``, ``dayofweek``,
    ``quarter`` and ``dayofyear`` are written onto ``df`` itself, which is
    also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime64 ``date`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new columns added.
    """
    dates = df['date'].dt
    df['year'] = dates.year
    df['month'] = dates.month
    # ISO week numbers above 52 are folded into week 52. clip() replaces the
    # former chained assignment `df['week'][mask] = 52`, which raises
    # SettingWithCopyWarning and has no effect under pandas copy-on-write.
    df['week'] = dates.isocalendar().week.clip(upper=52)
    df['day'] = dates.day
    df['dayofweek'] = dates.dayofweek
    df['quarter'] = dates.quarter
    df['dayofyear'] = dates.dayofyear
    # leap year correction: from day 60 (Feb 29) onwards shift back by one so
    # that the same calendar day gets the same number in every year
    df.loc[(df.date.dt.is_leap_year) & (df.dayofyear >= 60), 'dayofyear'] -= 1
    return df

# Add the calendar columns produced by process_time to both splits
train = process_time(train)
test = process_time(test)

##### We are ready to explore the data. In order to highlight the time series characteristics, we create panels of products x countries x shops. We start by aggregating at a year level.

In [None]:
# Yearly mean of num_sold: one figure per product, one panel per store (rows)
# and country (columns)
panel_stores = ['KaggleMart', 'KaggleRama']
panel_countries = ['Finland', 'Norway', 'Sweden']
for product in ['Kaggle Mug', 'Kaggle Hat', 'Kaggle Sticker']:
    print(f"\n--- {product} ---\n")
    fig, axes = plt.subplots(len(panel_stores), len(panel_countries), figsize=(20, 10), dpi=100)
    fig.subplots_adjust(hspace=0.25)
    product_rows = train[train['product'] == product]
    for axes_row, store in zip(axes, panel_stores):
        for ax, country in zip(axes_row, panel_countries):
            in_panel = (product_rows['country'] == country) & (product_rows['store'] == store)
            yearly_mean = product_rows[in_panel].groupby('year')['num_sold'].mean()
            yearly_mean.plot(ax=ax)
            ax.set_title(f"{country}:{store}")
    plt.show()

##### The first series of panels points out that the country effect is largely independent of store and product. There is an underlying country dynamic that replicates itself no matter the shop or the product sold by it. We also notice that shops differ only in the level of sales.

##### Our next panels will explore seasonality based on months:

In [None]:
# Monthly mean of num_sold, one line per year: one figure per product,
# one panel per store (rows) and country (columns)
panel_stores = ['KaggleMart', 'KaggleRama']
panel_countries = ['Finland', 'Norway', 'Sweden']
for product in ['Kaggle Mug', 'Kaggle Hat', 'Kaggle Sticker']:
    fig, axes = plt.subplots(len(panel_stores), len(panel_countries), figsize=(20, 10), dpi=100)
    fig.subplots_adjust(hspace=0.25)
    product_rows = train[train['product'] == product]
    for axes_row, store in zip(axes, panel_stores):
        for ax, country in zip(axes_row, panel_countries):
            in_panel = (product_rows['country'] == country) & (product_rows['store'] == store)
            panel = product_rows[in_panel]
            for year in [2015, 2016, 2017, 2018]:
                one_year = panel[panel['year'] == year]
                one_year.groupby('month')['num_sold'].mean().plot(ax=ax, label=year)
            ax.set_title(f"{product} | {country}:{store}")
            ax.legend()
    plt.show()

##### Here we notice two important elements: seasonality curves are different for each product and they also differ from year to year. Averaging the curves is probably a safe bet for the future, as is giving more weight to the recent years (for instance, weighting the year 2018 more). For the sticker product, the year 2017 seems particularly different from the others.

##### We now proceed to examine seasonality even more in detail at a week level:

In [None]:
# Weekly mean of num_sold, one line per year: one figure per product,
# one panel per store (rows) and country (columns)
panel_stores = ['KaggleMart', 'KaggleRama']
panel_countries = ['Finland', 'Norway', 'Sweden']
for product in ['Kaggle Mug', 'Kaggle Hat', 'Kaggle Sticker']:
    print(f"\n--- {product} ---\n")
    fig, axes = plt.subplots(len(panel_stores), len(panel_countries), figsize=(20, 10), dpi=100)
    fig.subplots_adjust(hspace=0.25)
    product_rows = train[train['product'] == product]
    for axes_row, store in zip(axes, panel_stores):
        for ax, country in zip(axes_row, panel_countries):
            in_panel = (product_rows['country'] == country) & (product_rows['store'] == store)
            panel = product_rows[in_panel]
            for year in [2015, 2016, 2017, 2018]:
                one_year = panel[panel['year'] == year]
                one_year.groupby('week')['num_sold'].mean().plot(ax=ax, label=year)
            ax.set_title(f"{country}:{store}")
            ax.legend()
    plt.show()

##### At a week level we see that differences are due to peaks. Peaks seem to differ in Spring. It is probably an Easter effect.

##### We now start observing recurrences at a day-of-month level:

In [None]:
# Mean of num_sold per day of the month, one line per year: one figure per
# product, one panel per store (rows) and country (columns)
panel_stores = ['KaggleMart', 'KaggleRama']
panel_countries = ['Finland', 'Norway', 'Sweden']
for product in ['Kaggle Mug', 'Kaggle Hat', 'Kaggle Sticker']:
    print(f"\n--- {product} ---\n")
    fig, axes = plt.subplots(len(panel_stores), len(panel_countries), figsize=(20, 10), dpi=100)
    fig.subplots_adjust(hspace=0.25)
    product_rows = train[train['product'] == product]
    for axes_row, store in zip(axes, panel_stores):
        for ax, country in zip(axes_row, panel_countries):
            in_panel = (product_rows['country'] == country) & (product_rows['store'] == store)
            panel = product_rows[in_panel]
            for year in [2015, 2016, 2017, 2018]:
                one_year = panel[panel['year'] == year]
                one_year.groupby('day')['num_sold'].mean().plot(ax=ax, label=year)
            ax.set_title(f"{country}:{store}")
            ax.legend()
    plt.show()

##### The middle of the month usually presents less sales. The peak at the end may be influenced by seasonal peaks (end of year).

##### And we complete the exploration by inspecting the day-of-week level:

In [None]:
# Mean of num_sold per day of the week, one line per year: one figure per
# product, one panel per store and country.
for product in ['Kaggle Mug', 'Kaggle Hat', 'Kaggle Sticker']:
    print(f"\n--- {product} ---\n")
    fig = plt.figure(figsize=(20, 10), dpi=100)
    fig.subplots_adjust(hspace=0.25)
    for i, store in enumerate(['KaggleMart', 'KaggleRama']):
        for j, country in enumerate(['Finland', 'Norway', 'Sweden']):
            ax = fig.add_subplot(2, 3, (i*3+j+1))
            selection = (train['country']==country)&(train['store']==store)&(train['product']==product)
            selected = train[selection]
            for year in [2015, 2016, 2017, 2018]:
                # Use the mean, like the other seasonality panels. A yearly sum
                # is inflated for whichever weekday occurs 53 times in that
                # year, which makes weekdays look different from year to year
                # for purely calendar reasons.
                selected[selected.year==year].set_index('date').groupby('dayofweek')['num_sold'].mean().plot(ax=ax, label=year)
            ax.set_title(f"{country}:{store}")
            ax.legend()
    plt.show()

##### Friday and the week-end are the best days, but Sundays are not always at the same level as Saturdays (it depends on the year - why?).

##### Based on the information we got we now proceed to feature engineering and to enrich the data (using festivities and GDP data).

In [None]:
# Holiday calendar; only the columns needed for the merge on (date, country)
# and the holiday name are loaded
holiday_csv = "../input/festivities-in-finland-norway-sweden-tsp-0122/nordic_holidays.csv"
festivities = pd.read_csv(holiday_csv, usecols=['date', 'country', 'holiday'], parse_dates=['date'])

In [None]:
# Yearly GDP comes as one column per country; reshape it to long format with
# one row per (year, country) so it can be merged on ['country', 'year'].
gdp_wide = pd.read_csv("../input/gdp-20152019-finland-norway-and-sweden/GDP_data_2015_to_2019_Finland_Norway_Sweden.csv")
gdp_columns = {'GDP_Finland': 'Finland', 'GDP_Norway': 'Norway', 'GDP_Sweden': 'Sweden'}
# melt() labels every row with its source column, so the country no longer
# depends on a hard-coded number of rows per country, and `year` keeps its
# original dtype instead of being cast to float by np.concatenate.
gdp = gdp_wide.melt(id_vars='year', value_vars=list(gdp_columns),
                    var_name='country', value_name='gdp')
gdp['country'] = gdp['country'].map(gdp_columns)
gdp = gdp[['year', 'gdp', 'country']]

##### We now process the data and scale it. Since EDA revealed how the different characteristics of the series are mostly main effects (country and store), we focus on finding the way to model the interaction between products and time.

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

def process_data(df):
    """Build the model feature matrix for ``df``.

    Reads the notebook-level ``gdp`` and ``festivities`` frames. ``df`` must
    carry ``row_id``, ``date``, ``country``, ``store``, ``product`` and the
    calendar columns ``year``, ``quarter``, ``month``, ``week``, ``day``,
    ``dayofyear`` and ``dayofweek``.

    Returns a DataFrame whose values were all cast to float32, sorted by and
    indexed on ``row_id``.
    """
    
    # feature name -> one value per row of df; 'row_id' is carried along so the
    # final frame can be sorted and indexed by it
    processed = dict()
    processed['row_id'] = df['row_id']
    
    print("creating dummies for main effects of time, country, store and product")
    to_dummies = ['country', 'store', 'product']
    for feat in to_dummies:
        tmp = pd.get_dummies(df[feat])
        for col in tmp.columns:
            processed[feat+'_'+str(col)] = tmp[col]
    # weekday 4 flag and weekday >= 5 flag (pandas counts Monday as 0)
    processed['wd4'] = (df.date.dt.weekday == 4).astype(int)
    processed['wd56'] = (df.date.dt.weekday >= 5).astype(int)
    
    print("modelling time as continuous")
    # NOTE(review): presumably a running day counter that relies on 18 rows
    # (country x store x product) per date in row_id order -- confirm
    processed['prog'] = ((df.row_id // 18) + 1)
    
    print("modelling time as cyclic")
    for time_measure in ['quarter', 'month', 'week', 'day', 'dayofyear', 'dayofweek']:
        processed[time_measure] = df[time_measure]
    
    print("adding country gdp")
    # NOTE(review): the constant is named an exponent but is applied below as a
    # multiplicative factor -- confirm this is intended
    gdp_exponent = 1.2121103201489674 # see https://www.kaggle.com/ambrosm/tpsjan22-03-linear-model for an explanation
    gdp_countries = df.merge(gdp, on=['country', 'year'], how='left')['gdp'].values
    processed['gdp'] = gdp_countries * gdp_exponent
    
    print("creating dummies and halo effect for Nordic holidays")
    # NOTE(review): the dummies are used positionally (.values) against df, which
    # assumes the left merge yields exactly one row per row of df and that df is
    # already ordered by row_id -- confirm
    tmp = pd.get_dummies(
            df.merge(festivities, on=['date', 'country'], how='left').sort_values('row_id')['holiday'])
    for col in tmp.columns:
        processed['holiday_' + str(col)] =  tmp[col].values
        # halo: signed distance in days (clipped to +/-14) from each date on which
        # this holiday occurs, added to every row of the same year (the year mask
        # below does not filter by country)
        halo = np.zeros(len(df))
        dates = df[(tmp[col]==1).values].date.unique()
        for date in dates:
            # truncate to year resolution; the integer is years since 1970
            year = date.astype('datetime64[Y]').astype(int) + 1970
            halo[df.year.values==year] += (df.date[df.year==year] - date).dt.days.clip(lower=-14, upper=14).values 
        processed['holiday_halo_' + str(col)] = halo
    
    # Christmas
    # days relative to Dec 25 of the row's own year, clipped to [-20, 6]
    xmas_date = df.date.dt.year.apply(lambda year: pd.Timestamp(str(year)+'-12-25'))
    processed['xmas_adjust'] = (df.date - xmas_date).dt.days.clip(lower=-20,upper=6)
        
    # New Year 
    # day of year capped at 10 and at 2 respectively
    processed['newyear_adjust1'] = df.dayofyear.clip(lower=0,upper=10)
    processed['newyear_adjust2'] = df.dayofyear.clip(lower=0,upper=2)
    
    # Easter
    # days relative to Easter of the row's year (dateutil.easter), clipped to [-3, 12]
    easter_date = df.date.apply(lambda date: pd.Timestamp(easter.easter(date.year)))
    processed['easter_adj'] = (df.date - easter_date).dt.days.clip(lower =-3,upper = 12)
    
    # Last Wednesday of June
    # reference dates are listed for 2015-2019 only; clipped to [-5, 5] days
    wed_june_date = df.date.dt.year.map({2015: pd.Timestamp(('2015-06-24')),
                                         2016: pd.Timestamp(('2016-06-29')),
                                         2017: pd.Timestamp(('2017-06-28')),
                                         2018: pd.Timestamp(('2018-06-27')),
                                         2019: pd.Timestamp(('2019-06-26'))})
    processed['days_from_wed_jun'] = (df.date - wed_june_date).dt.days.clip(-5, 5)
    
    #First Sunday of November (second Sunday is Father's Day)
    # reference dates are listed for 2015-2019 only; clipped to [-1, 9] days
    sun_nov_date = df.date.dt.year.map({2015: pd.Timestamp(('2015-11-1')),
                                         2016: pd.Timestamp(('2016-11-6')),
                                         2017: pd.Timestamp(('2017-11-5')),
                                         2018: pd.Timestamp(('2018-11-4')),
                                         2019: pd.Timestamp(('2019-11-3'))})
    processed['days_from_sun_nov'] = (df.date - sun_nov_date).dt.days.clip(-1, 9)
    
    # row_id is not a feature, hence the -1
    print(f"completed processing {len(processed)-1} features")

    # Cast every entry to a float32 array and stack them column-wise
    values = list()
    columns = list()
    for key, value in processed.items():
        value = np.array(value).astype(np.float32)
        values.append(value)
        columns.append(key)
        
    # (n_features + 1, n_rows) -> (n_rows, n_features + 1)
    values = np.array(values).T
    print(values.shape, values[1].shape, len(columns))
    
    processed = pd.DataFrame(values, columns=columns)
    
    print("resorting row ids")
    processed = processed.sort_values('row_id').set_index('row_id')
    return processed

def process_target(df):
    """Return ``num_sold`` as a one-column frame ordered by and indexed on ``row_id``."""
    id_and_sales = df[['row_id', 'num_sold']]
    return id_and_sales.sort_values('row_id').set_index('row_id')

# Process train and test together so both get the same feature columns.
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames the
# same way (original index labels are kept).
train_test = process_data(pd.concat([train, test]))

# process_data returns rows ordered by row_id: the first len(train) rows are
# taken as the training part, the remainder as the test part
processed_train = train_test.iloc[:len(train)].copy()
processed_test = train_test.iloc[len(train):].copy()

# 1-D array of num_sold ordered by row_id
target = np.ravel(process_target(train))

In [None]:
# Compare the shapes of the processed feature frames with the raw train/test frames
processed_train.shape, train.shape, processed_test.shape, test.shape

##### We prepare all the evaluation measures, both at an aggregate level, with exp transformation and at an individual cases level (for error analysis)

In [None]:
def SMAPE(y_true, y_pred):
    """Mean symmetric absolute percentage error between ``y_true`` and ``y_pred``.

    Each error is ``|y_true - y_pred|`` divided by ``(y_true + |y_pred|) / 200``;
    entries whose denominator is zero count as an error of 0.
    """
    # Formula from https://www.kaggle.com/cpmpml/smape-weirdness
    scale = (y_true + np.abs(y_pred)) / 200.0
    pct_error = np.abs(y_true - y_pred) / scale
    # a zero denominator would leave NaN/inf behind; score those entries as 0
    pct_error[scale == 0] = 0.0
    return np.mean(pct_error)

def SMAPE_exp(y_true, y_pred):
    """Mean SMAPE computed after applying ``np.exp`` to both inputs.

    Each error is the absolute difference of the exponentiated values divided
    by ``(exp(y_true) + |exp(y_pred)|) / 200``; entries whose denominator is
    zero count as an error of 0.
    """
    actual = np.exp(y_true)
    predicted = np.exp(y_pred)
    scale = (actual + np.abs(predicted)) / 200.0
    pct_error = np.abs(actual - predicted) / scale
    # a zero denominator would leave NaN/inf behind; score those entries as 0
    pct_error[scale == 0] = 0.0
    return np.mean(pct_error)

def SMAPE_err(y_true, y_pred):
    """Per-element symmetric absolute percentage errors (not averaged).

    Each error is ``|y_true - y_pred|`` divided by ``(y_true + |y_pred|) / 200``;
    entries whose denominator is zero are set to 0.
    """
    # Formula from https://www.kaggle.com/cpmpml/smape-weirdness
    scale = (y_true + np.abs(y_pred)) / 200.0
    pct_error = np.abs(y_true - y_pred) / scale
    # a zero denominator would leave NaN/inf behind; score those entries as 0
    pct_error[scale == 0] = 0.0
    return pct_error

def selective_rounding(preds, lower=0.3, upper=0.7):
    """Round only the predictions whose fractional part is close to an integer.

    Values whose fractional part is ``<= lower`` or ``>= upper`` are rounded to
    the nearest integer; all other values are returned unchanged.

    Parameters
    ----------
    preds : numpy.ndarray, pandas.Series or sequence of numbers
        Predictions to round.
    lower, upper : float
        Thresholds on the fractional part (``preds % 1``).

    Returns
    -------
    A new array (or Series, when a Series is passed) holding the selectively
    rounded values. The input is left untouched: rounding is done on a copy so
    that callers keeping a reference to the raw predictions are not affected.
    """
    if isinstance(preds, (np.ndarray, pd.Series)):
        rounded = preds.copy()
    else:
        rounded = np.array(preds, dtype=float)
    frac = rounded % 1
    near_integer = (frac <= lower) | (frac >= upper)
    rounded[near_integer] = np.round(rounded[near_integer])
    return rounded

def weighting(df, weights):
    """Return an array with each row's ``year`` replaced by its entry in ``weights``."""
    per_row = df['year'].replace(weights)
    return per_row.values
    
# Per-row weights that halve for each year further back from 2018.
# NOTE(review): `weights` is not passed to any `fit` call in the visible cells --
# confirm whether sample weighting was intended.
weights = weighting(train, {2015:0.125, 2016:0.25, 2017:0.5, 2018:1})

In [None]:
def objective(trial):
    """Optuna objective: mean SMAPE of an XGBRegressor over leave-one-year-out folds.

    For every calendar year found in ``train`` the model is fitted on all the
    other years and scored with ``SMAPE`` on the held-out year. The mean of the
    fold scores is returned.

    Relies on the notebook-level ``train``, ``processed_train`` and ``target``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.

    Returns
    -------
    float
        Mean SMAPE across the yearly folds.
    """

    # suggest_float(..., log=True) samples the same log-uniform range as the
    # deprecated suggest_loguniform and matches the other entries of this dict.
    params = {
            'learning_rate': trial.suggest_float("learning_rate", 1e-4, 1.0, log=True),
            'reg_lambda': trial.suggest_float("reg_lambda", 1e-8, 100.0, log=True),
            'reg_alpha': trial.suggest_float("reg_alpha", 1e-8, 100.0, log=True),
            'subsample': trial.suggest_float("subsample", 0.1, 1.0),
            'colsample_bytree': trial.suggest_float("colsample_bytree", 0.1, 1.0),
            'max_depth': trial.suggest_int("max_depth", 1, 9),
            'min_child_weight': trial.suggest_int("min_child_weight", 1, 25),
            'gamma': trial.suggest_float("gamma", 1e-8, 1.0, log=True),
            'grow_policy': trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"]),
    }

    model = XGBRegressor(
        random_state=0,
        tree_method="gpu_hist",
        predictor="gpu_predictor",
        n_estimators=10_000,
        **params,
    )

    smape = list()

    for year in train.year.unique():

        # row_id values are used below as positional indices into
        # processed_train and target; this assumes row_id equals the 0-based
        # row position -- TODO confirm
        train_set = list(train.row_id[train.year!=year])
        val_set = list(train.row_id[train.year==year])

        x = processed_train.iloc[train_set]
        x_test = processed_train.iloc[val_set]
        y = target[train_set]
        y_test = target[val_set]

        model.fit(x, y, verbose=100)

        preds = model.predict(x_test)
        smape.append(SMAPE(y_test, preds))

    return np.mean(smape)

In [None]:
# Seed the sampler so that Restart & Run All proposes the same sequence of
# hyper-parameters (TPESampler is Optuna's default sampler; only the seed is new).
# NOTE(review): GPU training itself may still introduce run-to-run variation.
study = optuna.create_study(direction="minimize",
                            sampler=optuna.samplers.TPESampler(seed=0))
study.optimize(objective, n_trials=60)

In [None]:
# study.best_value is the objective's return value, i.e. the SMAPE averaged
# over the leave-one-year-out folds, not the score of a single hold-out year
print(f"Best mean SMAPE across yearly folds: {study.best_value:0.5f}")

In [None]:
# Header line followed by the dict of best hyper-parameters found by the study
print("best parameters:", study.best_params, sep="\n")

##### We now use the XGBRegressor with its best parameters to train on various yearly subsets of all data and to predict on the test set.

In [None]:
params = study.best_params
years = train.year.unique()
test_preds = np.zeros(len(processed_test))


def _new_model():
    """Return an unfitted XGBRegressor configured with the best parameters."""
    return XGBRegressor(random_state=0,
                        tree_method="gpu_hist",
                        predictor="gpu_predictor",
                        n_estimators=10_000,
                        **params,)


# Leave-one-year-out: fit on every year but one, report SMAPE on the year left
# out and add this model's share to the averaged test prediction
for year in years:
    fit_rows = list(train.row_id[train.year!=year])
    held_out_rows = list(train.row_id[train.year==year])

    model = _new_model()
    model.fit(processed_train.iloc[fit_rows], target[fit_rows])

    fold_preds = model.predict(processed_train.iloc[held_out_rows])
    fold_smape = SMAPE(target[held_out_rows], fold_preds)

    print(f"SMAPE year=={year} : {fold_smape:0.5f}")

    test_preds += model.predict(processed_test) / len(years)

# Model fitted on all the training data
model = _new_model()
model.fit(processed_train, target)

# Blend: half the fold average, half the all-data model
test_preds = test_preds * 0.5 + model.predict(processed_test) * 0.5

In [None]:
submission = pd.read_csv("../input/tabular-playground-series-jan-2022/sample_submission.csv")

# rounding -- done on a copy so that the blended `test_preds` from the previous
# cell is not overwritten when this cell runs
preds = selective_rounding(test_preds.copy(), lower=0.3, upper=0.7)

# Item assignment always writes the column; attribute assignment
# (`submission.num_sold = ...`) would only set a Python attribute if the
# column were missing from the sample file.
submission['num_sold'] = preds
submission.to_csv("submission.csv", index=False)

In [None]:
# Preview the first rows of the submission frame
submission.head()

### If you liked the notebook, please consider upvoting. Thank you and happy Kaggling!