![](https://centralnicregistry.com/assets/img/blog/graph.png)

# Tabular Playground Series - Jan 2022
## A Statistical Analysis

**Objective**: 
> Choose the best store chains selling Kaggle merchandise

**Data**:  
* train.csv - it includes the sales data for each date-country-store-item combination
* test.csv - the task is to predict the corresponding item sales for each date-country-store-item combination.
* sample_submission.csv - a sample submission file in the correct format

**Evaluation**: 
> Submissions are evaluated on SMAPE between forecasts and actual values. 
> SMAPE belongs to the family of percentage errors; for this reason it is unit-free and pretty useful when comparing forecast performances between data sets. Symmetric MAPE (sMAPE) was proposed by [Armstrong (1978, p. 348)](https://econpapers.repec.org/article/eeeintfor/v_3a2_3ay_3a1986_3ai_3a3_3ap_3a387-390.htm) in order to overcome a disadvantage of MAPE, namely that it puts a heavier penalty on negative errors than on positive errors. (See [Evaluating point forecast accuracy](https://otexts.com/fpp3/accuracy.html))

## Importing libraries and data preparation


In [None]:
# Data exploration
import numpy as np
import math
import pandas as pd
import datetime as dt
import scipy

# Time Series Features
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf

# Time Series Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Additional information
import time
import holidays
import dateutil.easter as easter

# Patching sklearn
!pip install scikit-learn-intelex
from sklearnex import patch_sklearn
patch_sklearn()

# Model Building 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

# XGBoost
from xgboost import XGBRegressor

# Catboost
import catboost
from catboost import CatBoostRegressor, Pool
import shap

# Lgbm
from lightgbm import LGBMRegressor
import lightgbm as lgb

# Settings
# Default figure size for the plots of this notebook.
sns.set(rc={'figure.figsize':(26, 8)})
# Hide the right and top axes spines and switch to the "ticks" style.
custom_params = {"axes.spines.right": False, "axes.spines.top": False}
sns.set_theme(style="ticks", rc=custom_params)

In [None]:
# Load the competition files. Only the `date` column is parsed as datetime:
# `parse_dates=True` would ask pandas to date-parse the index instead (for the
# test file that is the `row_id` index selected by `index_col=0`).
train_data = pd.read_csv('../input/tabular-playground-series-jan-2022/train.csv',
                         parse_dates=['date'])
test_data = pd.read_csv('../input/tabular-playground-series-jan-2022/test.csv',
                        index_col=0, parse_dates=['date'])

print(f'Train Data rows: {train_data.shape[0]} \nTrain Data Columns: {train_data.shape[1]}\n')
print(f'Test Data rows: {test_data.shape[0]} \nTest Data Columns: {test_data.shape[1]}')

# Date range covered by each file
print('\n','-'*22, sep='')
print('\nTrain Data start date:', train_data['date'].min().strftime('%B %d, %Y'),
      '\nTrain Data end date: ', train_data['date'].max().strftime('%B %d, %Y'), '\n')

print('Test Data start date:', test_data['date'].min().strftime('%B %d, %Y'),
      '\nTest Data end date: ', test_data['date'].max().strftime('%B %d, %Y'))

In [None]:
# Creating support lists

def check_consistency():
    """Check that train and test contain the same countries, stores and products.

    Reads the module-level ``train_data`` and ``test_data`` frames. For each of
    the ``country``, ``store`` and ``product`` columns the sets of unique
    values of both frames are compared and the outcome is printed.

    Returns:
        list: the unique train values (one array per column) of every column
        whose check passed, in the order country, store, product. A failed
        check contributes nothing, so the list may hold fewer than three items.
    """
    checks = [('countries', 'country'), ('stores', 'store'), ('products', 'product')]
    result = []

    for number, (label, column) in enumerate(checks, start=1):
        train_values = train_data[column].unique()
        test_values = test_data[column].unique()

        # Compare the value sets themselves. Comparing `a.all() == b.all()`
        # reduces each array to a single value first, so two different
        # category lists could still be reported as consistent.
        if set(train_values) == set(test_values):
            print(f"Consistency test {number}, {label}: passed")
            result.append(train_values)
        else:
            print(f"Consistency test {number}, {label}: failed")

    return result

# Unpacking into three names requires all three checks to pass:
# check_consistency() returns fewer lists otherwise and this line raises a ValueError.
train_country_list, train_store_list, train_product_list = check_consistency()

# Show the category values found in the train data
print('\nCountries: ', train_country_list, '\n',
      'Stores: ', train_store_list, '\n',
      'Products: ', train_product_list, sep='')

In [None]:
# Checking NaNs

def check_na(train_data, test_data):
    """Print whether the train and the test frame contain missing values.

    Args:
        train_data: DataFrame with the training rows.
        test_data: DataFrame with the test rows.
    """
    for label, frame in (('Train', train_data), ('Test', test_data)):
        # True as soon as a single cell of the frame is null
        if frame.isnull().values.any():
            print(f'Missing values detected in {label} Data')
        else:
            print(f'No missing values detected in {label} Data')
    
check_na(train_data, test_data)

In [None]:
# Adding holidays info
def add_holidays(train_data, test_data=None):
    """Add a binary ``holiday`` column to the train and the test frame.

    The holiday calendars of Finland, Norway and Sweden for 2015-2019 are
    merged into one lookup, so a date that is a holiday in any of the three
    countries is flagged on every row with that date, whatever its country.
    Both frames are modified in place; ``row_id`` is also dropped from
    ``train_data``.

    Args:
        train_data: training frame with a datetime ``date`` column and a
            ``row_id`` column.
        test_data: test frame with a datetime ``date`` column. When omitted,
            the module-level ``test_data`` is used, which is what the
            one-argument call has always relied on.

    Returns:
        tuple: ``(train_data, test_data)``, the same objects that were modified.
    """
    if test_data is None:
        # Backward compatibility with the one-argument call
        test_data = globals()['test_data']

    years = list(range(2015, 2020))
    finland_holidays = holidays.CountryHoliday('FI', years = years)
    norway_holidays  = holidays.CountryHoliday('NO', years = years)
    sweden_holidays  = holidays.CountryHoliday('SE', years = years)

    # Union of the three national calendars
    holiday_d = finland_holidays.copy()
    holiday_d.update(norway_holidays)
    holiday_d.update(sweden_holidays)

    for frame in (train_data, test_data):
        # Dates missing from the lookup map to NaN -> 0, every other date -> 1
        frame['holiday'] = np.where(frame['date'].map(holiday_d).isnull(), 0, 1)

    train_data.drop(['row_id'], axis = 1, inplace = True)

    print('Holidays added for years 2015-2018, Train Data \nHolidays added for year 2019, Test Data')

    return train_data, test_data

train_data, test_data = add_holidays(train_data)

In [None]:
# Thanks to @AmbrosM for the hints

def engineer_more(df):
    """Return a new dataframe with more engineered features.

    Every added column flags one specific day, in some cases restricted to a
    single country: the end of December, the first half of January, parts of
    May and June, the days around the last Wednesday of June and the first
    Sunday of November, 6-13 December in Finland, and fixed day offsets from
    Easter Sunday. ``df`` itself is left untouched.

    Args:
        df: DataFrame with a datetime ``date`` column and a ``country`` column.

    Returns:
        DataFrame: ``df`` with the flag columns appended, passed through
        ``astype(np.float32, errors='ignore')``.
    """
    month = df.date.dt.month
    day = df.date.dt.day
    country = df.country

    def on(month_no, day_no):
        # Flag for one fixed calendar day, whatever the year
        return (month == month_no) & (day == day_no)

    # Column name -> boolean Series; insertion order is the final column order
    flags = {}

    # End of year
    flags.update({f"dec{d}": on(12, d) for d in range(24, 32)})
    flags.update({f"n-dec{d}": on(12, d) & (country == 'Norway') for d in range(24, 32)})
    flags.update({f"f-jan{d}": on(1, d) & (country == 'Finland') for d in range(1, 14)})
    flags.update({f"jan{d}": on(1, d) & (country == 'Norway') for d in range(1, 10)})
    flags.update({f"s-jan{d}": on(1, d) & (country == 'Sweden') for d in range(1, 15)})

    # May: first days for every country, 19-25 for Norway only
    flags.update({f"may{d}": on(5, d) for d in range(1, 10)})  # + list(range(17, 25))
    flags.update({f"may{d}": on(5, d) & (country == 'Norway') for d in range(19, 26)})

    # June and July
    flags.update({f"june{d}": on(6, d) & (country == 'Sweden') for d in range(8, 14)})
    # Disabled variants, kept for reference:
    #   f"june{d}": on(6, d) & (country == 'Norway') for d in range(22, 31)
    #   f"july{d}": on(7, d) & (country == 'Norway') for d in range(1, 3)

    not_norway = country != 'Norway'
    row_year = df.date.dt.year

    # Last Wednesday of June (years outside the table yield no flag)
    wed_june_date = row_year.map({2015: pd.Timestamp('2015-06-24'),
                                  2016: pd.Timestamp('2016-06-29'),
                                  2017: pd.Timestamp('2017-06-28'),
                                  2018: pd.Timestamp('2018-06-27'),
                                  2019: pd.Timestamp('2019-06-26')})
    since_wed_june = df.date - wed_june_date
    flags.update({f"wed_june{d}": (since_wed_june == np.timedelta64(d, "D")) & not_norway
                  for d in range(-4, 6)})

    # First Sunday of November
    sun_nov_date = row_year.map({2015: pd.Timestamp('2015-11-1'),
                                 2016: pd.Timestamp('2016-11-6'),
                                 2017: pd.Timestamp('2017-11-5'),
                                 2018: pd.Timestamp('2018-11-4'),
                                 2019: pd.Timestamp('2019-11-3')})
    since_sun_nov = df.date - sun_nov_date
    flags.update({f"sun_nov{d}": (since_sun_nov == np.timedelta64(d, "D")) & not_norway
                  for d in range(0, 9)})

    # First half of December (Independence Day of Finland, 6th of December)
    flags.update({f"dec{d}": on(12, d) & (country == 'Finland') for d in range(6, 14)})

    # Easter: day offsets relative to Easter Sunday of the row's year
    easter_date = df.date.apply(lambda date: pd.Timestamp(easter.easter(date.year)))
    since_easter = df.date - easter_date
    easter_offsets = (*range(-2, 11), *range(40, 48), *range(50, 59))
    flags.update({f"easter{d}": since_easter == np.timedelta64(d, "D")
                  for d in easter_offsets})

    extended = pd.concat([df, pd.DataFrame(flags)], axis=1)
    return extended.astype(np.float32, errors='ignore')


train_data, test_data = engineer_more(train_data), engineer_more(test_data)

In [None]:
# Adding GDP
# See https://www.kaggle.com/ambrosm/tpsjan22-03-linear-model for an explanation of the exponent
gdp_exponent = 1.2121103201489674
gdp_df = pd.read_csv('../input/gdp-20152019-finland-norway-and-sweden/GDP_data_2015_to_2019_Finland_Norway_Sweden.csv')
gdp_df.set_index('year', inplace=True)

def get_gdp(row):
    """Look up the GDP for the year and the country of one data row."""
    gdp_column = 'GDP_' + row.country
    row_year = row.date.year
    return gdp_df.loc[row_year, gdp_column]

# log1p of the GDP matching each row, computed row by row
train_gdp = train_data.apply(get_gdp, axis=1)
train_data['gdp'] = np.log1p(train_gdp)
test_gdp = test_data.apply(get_gdp, axis=1)
test_data['gdp'] = np.log1p(test_gdp)

In [None]:
# From https://www.kaggle.com/ambrosm/tpsjan22-03-linear-model#Simple-feature-engineering-(without-holidays)
def FourierFeatures(df):
    """Return a copy of ``df`` with product-specific Fourier (seasonal) features.

    For k = 1, 2 the columns ``mug_sin{k}``, ``mug_cos{k}``, ``hat_sin{k}`` and
    ``hat_cos{k}`` are appended. Each holds the sine or cosine of the k-th
    yearly harmonic (day of year / 365) on rows of the matching product and 0
    on every other row. No plain ``sin``/``cos`` column is kept, so rows of any
    other product get 0 in all added columns.

    The input frame is not modified: the work is done on a copy, so no
    temporary column leaks into the caller's frame.

    Args:
        df: DataFrame with a datetime ``date`` column and a ``product`` column.

    Returns:
        DataFrame: copy of ``df`` with the eight feature columns appended.
    """
    df = df.copy()

    # Boolean row selectors, used instead of temporary one-hot columns
    product_masks = {'mug': df['product'] == 'Kaggle Mug',
                     'hat': df['product'] == 'Kaggle Hat'}

    dayofyear = df.date.dt.dayofyear
    for k in range(1, 3):
        angle = dayofyear / 365 * 2 * math.pi * k
        waves = {'sin': np.sin(angle), 'cos': np.cos(angle)}
        for prefix, mask in product_masks.items():
            for wave_name, wave in waves.items():
                # float * bool -> the wave value on matching rows, 0 elsewhere
                df[f'{prefix}_{wave_name}{k}'] = wave * mask

    return df

# add fourier features
train_data=FourierFeatures(train_data)
test_data=FourierFeatures(test_data)

# Time Series Visualization

In [None]:
# Time series plotting
def plotting_ts(country = 'Finland', store = 'KaggleMart', product = 'Kaggle Mug'):
    """Plot ``num_sold`` over time for one country/store/product combination.

    The rows are taken from the module-level ``train_data`` frame.
    """
    is_selected = ((train_data['country'] == country) &
                   (train_data['store'] == store) &
                   (train_data['product'] == product))

    axis = sns.lineplot(data = train_data[is_selected], x = 'date', y = 'num_sold')
    axis.set_title(f'{product} in {store}, {country}')
    plt.show()
    

# Time Series Decomposition
def decomposition_ts(country = 'Finland', store = 'KaggleMart', product = 'Kaggle Mug'):
    """Decompose one sales series multiplicatively and plot the components.

    The series is the ``num_sold`` column of the module-level ``train_data``
    rows matching the given country, store and product, indexed by date.

    Returns:
        The object returned by ``seasonal_decompose``.
    """
    is_selected = ((train_data['country'] == country) &
                   (train_data['store'] == store) &
                   (train_data['product'] == product))
    series = train_data[is_selected][['num_sold', 'date']].set_index('date')

    plt.rc('font',size=15)
    decomposition = seasonal_decompose(series, model='multiplicative')
    print('\nTime Series Decomposition\n')
    decomposition.plot()

    return decomposition


# Auto Correlation Function (ACF) plot
def acf_ts(country = 'Finland', store = 'KaggleMart', product = 'Kaggle Mug'):
    """Plot the autocorrelation function of one sales series.

    The series is the ``num_sold`` column of the module-level ``train_data``
    rows matching the given country, store and product, indexed by date.
    """
    is_selected = ((train_data['country'] == country) &
                   (train_data['store'] == store) &
                   (train_data['product'] == product))
    series = train_data[is_selected][['num_sold', 'date']].set_index('date')

    plt.rc('font',size=15)
    plot_acf(series)
    print('\nAuto Correlation Function plot\n')
    plt.show()

    
plotting_ts()
acf_ts()
result = decomposition_ts()

## Notes

1. Regarding the decomposition plot, this is a simple decomposition. You should also evaluate if an additive or a multiplicative decomposition fits your needs. 

> The additive decomposition is the most appropriate if the magnitude of the seasonal fluctuations, or the variation around the trend-cycle, does not vary with the level of the time series. When the variation in the seasonal pattern, or the variation around the trend-cycle, appears to be proportional to the level of the time series, then a multiplicative decomposition is more appropriate.   
>  
> Additive decomposition: 
> $$y_{t} = S_{t} + T_{t} + R_t$$
> Multiplicative decomposition: 
> $$y_{t} = S_{t} \times T_{t} \times R_t$$
>  
> The results are obtained by first estimating the trend by applying a convolution filter to the data. The trend is then removed from the series and the average of this de-trended series for each period is the returned seasonal component.

2. Regarding the ACF plot, it can be interpreted as follows:

> * When data have a trend, the autocorrelations for small lags tend to be large and positive because observations nearby in time are also nearby in value. So the ACF of a trended time series tends to have positive values that slowly decrease as the lags increase.
> * When data are seasonal, the autocorrelations will be larger for the seasonal lags (at multiples of the seasonal period) than for other lags.
> * When data are both trended and seasonal, you see a combination of these effects. 
>
> In this example you can notice a combination of trend and seasonality. There is a trend, since the autocorrelations are positive and decrease slowly as the lag increases, and there is seasonality, since the autocorrelation tends to be larger at the seasonal lags.

3. Residuals (random component) analysis should be performed in order to spot possible inconsistent behaviors.

> In this example the total number of 'Kaggle Mug' sold spikes each January and suddenly decreases. This behavior should be investigated in order to fully understand the time series. See 'Detecting Outliers: Point outliers'.


## Further Example

In [None]:
# Series shown in this example. `country`, `store`, `product` and `result`
# stay defined at module level and are read again by the outlier plot cell below.

# Country selection
country = 'Finland'

# Store selection
store = 'KaggleRama'

# Product selection
product = 'Kaggle Sticker'

# Line plot, ACF plot and multiplicative decomposition of the selected series
plotting_ts(country, store, product)
acf_ts(country, store, product)
result = decomposition_ts(country, store, product)

## Detecting Outliers: Point Outliers

> A point outlier behaves unusually in a specific time instance when compared to other values in the time series (global outlier), or to its neighborhood (local outlier)

In [None]:
# Residual component of the decomposition computed in the previous cell
x = result.resid.index
y = result.resid.values

residual_axis = sns.lineplot(x = x, y = y)
residual_axis.set_title(f'Possible outliers in {product} - {store}, {country}')

# Hand-placed labels: (x position, y position, horizontal alignment)
outlier_labels = [('2016-01', 1.34, 'left'),
                  ('2016-01', 0.75, 'left'),
                  ('2017-01', 1.25, 'left'),
                  ('2017-01', 0.78, 'left'),
                  ('2019-01', 0.77, 'right')]
for label_x, label_y, label_align in outlier_labels:
    plt.text(label_x, label_y, "Possible Outlier", horizontalalignment=label_align,
             size='medium', color='Red', weight='semibold')

plt.show()

## Notes

In this specific case, we are dealing with sales. It may be that due to high volumes of transactions in December several invoices are registered in January.

## Time Series Comparison and Further Visualizations

In [None]:
# Total Sales
# Aggregate only `num_sold` per date and store: summing the whole frame would
# also add up the string columns and every engineered feature column.
tot_comp_tab = train_data.groupby(['date', 'store'])['num_sold'].sum().reset_index()
sns.lineplot(data = tot_comp_tab,
             x = 'date',
             y = 'num_sold',
             hue = 'store').set_title('KaggleMart VS KaggleRama Total Sales')
plt.show()

## Comparison by Country

In [None]:
fig, axes = plt.subplots(3, 1, figsize=(26,18))
fig.suptitle('Country Comparison')

# Daily `num_sold` per country and store, aggregated once and shared by all panels
country_comp_tab = (train_data.groupby(['date', 'country', 'store'])['num_sold']
                    .sum()
                    .reset_index())

# Finland
fin_comp_tab = country_comp_tab[country_comp_tab['country']=='Finland']

# Norway
nor_comp_tab = country_comp_tab[country_comp_tab['country']=='Norway']

# Sweden
swe_comp_tab = country_comp_tab[country_comp_tab['country']=='Sweden']

# One panel per country. Every panel is drawn from its own country's table;
# plotting the Finland table under the Sweden title would show wrong data.
country_panels = [('Finland', fin_comp_tab),
                  ('Norway', nor_comp_tab),
                  ('Sweden', swe_comp_tab)]
for panel_axis, (panel_country, panel_tab) in zip(axes, country_panels):
    sns.lineplot(data = panel_tab,
                 ax = panel_axis,
                 x = 'date',
                 y = 'num_sold',
                 hue = 'store')
    panel_axis.set_title(f'KaggleMart VS KaggleRama Total Sales - {panel_country}')

plt.show()

## Comparison by Product

In [None]:
fig, axes = plt.subplots(3, 1, figsize=(26,18))
fig.suptitle('Product Comparison')

# Daily `num_sold` per product and store, aggregated once and shared by all
# panels (the same groupby over every column used to be repeated three times).
product_comp_tab = (train_data.groupby(['date', 'product', 'store'])['num_sold']
                    .sum()
                    .reset_index())

# Kaggle Mug
mug_comp_tab = product_comp_tab[product_comp_tab['product']=='Kaggle Mug']

# Kaggle Hat
hat_comp_tab = product_comp_tab[product_comp_tab['product']=='Kaggle Hat']

# Kaggle Sticker
sticker_comp_tab = product_comp_tab[product_comp_tab['product']=='Kaggle Sticker']

# One panel per product
product_panels = [('Kaggle Mug', mug_comp_tab),
                  ('Kaggle Hat', hat_comp_tab),
                  ('Kaggle Sticker', sticker_comp_tab)]
for panel_axis, (panel_product, panel_tab) in zip(axes, product_panels):
    sns.lineplot(data = panel_tab,
                 ax = panel_axis,
                 x = 'date',
                 y = 'num_sold',
                 hue = 'store')
    panel_axis.set_title(f'KaggleMart VS KaggleRama Total Sales - {panel_product}')

plt.show()

## Notes

From this preliminary analysis we can already notice that KaggleRama's overall performance is better than KaggleMart's, both across countries and across products. These results suggest that KaggleRama should outperform KaggleMart in 2019 as well. In the next section these results are formalized in a mathematical model.

# Models fitting

In [None]:
# Features Encoding
def var_encoding(train, test):
    """One-hot encode ``country``, ``store`` and ``product`` in both frames.

    Each frame is encoded on its own with ``pd.get_dummies``; the inputs are
    not modified.

    Returns:
        tuple: ``(encoded_train, encoded_test)``.
    """
    categorical = ['country', 'store', 'product']
    encoded_train, encoded_test = (
        pd.get_dummies(frame, columns=categorical) for frame in (train, test)
    )
    return encoded_train, encoded_test

train_data, test_data = var_encoding(train_data, test_data)

In [None]:
def further_features(train, test):
    """Add calendar columns derived from ``date`` to both frames, in place.

    Added columns, in this order: ``year``, ``quarter``, ``month``, ``week``
    (ISO week number), ``day``, ``weekday``, ``dayofyear`` and ``is_weekend``
    (1 when ``weekday`` is 5 or 6, else 0).

    Returns:
        tuple: ``(train, test)``, the same objects that were passed in.
    """
    for frame in (train, test):
        stamp = frame['date'].dt
        frame['year'] = stamp.year
        frame['quarter'] = stamp.quarter
        frame['month'] = stamp.month
        frame['week'] = stamp.isocalendar().week
        frame['day'] = stamp.day
        frame['weekday'] = stamp.weekday
        frame['dayofyear'] = stamp.dayofyear
        frame['is_weekend'] = np.where(frame['weekday'].isin([5, 6]), 1, 0)

    return train, test

train_data, test_data = further_features(train_data, test_data)

In [None]:
# Train / validation split: rows dated up to 2017-12-31 are used for fitting,
# later rows for validation. `num_sold` is the target, and `date` is removed
# from every feature set.
fit_rows = train_data['date'] <= '2017-12-31'
val_rows = train_data['date'] > '2017-12-31'

train_set = train_data[fit_rows].drop(['num_sold', 'date'], axis = 1)
y_train = train_data.loc[fit_rows, 'num_sold']

validation_set = train_data[val_rows].drop(['num_sold', 'date'], axis = 1)
y_val = train_data.loc[val_rows, 'num_sold']

# test_set is the same object as test_data, so the in-place drop also removes
# `date` from test_data
test_set = test_data
test_set.drop('date', axis = 1, inplace = True)

In [None]:
# Data Standardization
def scaling_feat(train_set, validation_set, test_set):
    """Standardize the three feature frames with a scaler fitted on the train set.

    The ``StandardScaler`` is fitted on ``train_set`` only and then applied to
    the validation and test frames. The shapes are printed before and after.

    Returns:
        tuple: scaled ``(train_set, validation_set, test_set)`` as new
        DataFrames that keep the original index and column labels.
    """
    labels = ('train_set', 'validation_set', 'test_set')
    originals = (train_set, validation_set, test_set)

    def shape_report(frames):
        # "<name>: <shape>" per frame, in the layout used by both printouts
        return ' \n'.join(f'{label}: {frame.shape}' for label, frame in zip(labels, frames))

    print(f'Dimensions before scaling: \n{shape_report(originals)}')

    scaler = StandardScaler()
    scaled_values = [scaler.fit_transform(train_set)]
    scaled_values += [scaler.transform(frame) for frame in (validation_set, test_set)]

    # Wrap the scaled arrays back into frames carrying the original labels
    scaled_frames = tuple(
        pd.DataFrame(values, index=frame.index, columns=frame.columns)
        for values, frame in zip(scaled_values, originals)
    )

    print(f'\nDimensions after scaling: \n{shape_report(scaled_frames)}')

    return scaled_frames

train_set, validation_set, test_set = scaling_feat(train_set, validation_set, test_set)

In [None]:
# Creating the Score Metric
def SMAPE(y_true, y_pred):
    """Return the symmetric mean absolute percentage error, in percent.

    Computed as ``mean(200 * |y_true - y_pred| / (|y_true| + |y_pred|))``.
    Positions where both values are 0 contribute an error of 0.

    Args:
        y_true: actual values (array-like: list, NumPy array or pandas Series).
        y_pred: forecast values, same length as ``y_true``.

    Returns:
        float: the SMAPE; 0 is a perfect forecast, 200 is the maximum.
    """
    # Plain float arrays, so lists and Series are handled alike
    y_true = np.atleast_1d(np.asarray(y_true, dtype=float))
    y_pred = np.atleast_1d(np.asarray(y_pred, dtype=float))

    den = (np.abs(y_true) + np.abs(y_pred)) / 200.0
    num = np.abs(y_true - y_pred)

    # Divide only where the denominator is non-zero; the remaining positions
    # keep the 0.0 of `out`, which avoids 0/0 warnings and NaNs.
    d = np.divide(num, den, out=np.zeros_like(den), where=den != 0)
    return np.mean(d)

SMAPE_score = make_scorer(SMAPE, greater_is_better = False)

# Random Forest Regressor

In [None]:
# Defining the model
#model = RandomForestRegressor()
#param_search = { 
#    'n_estimators': [10, 20, 50, 100, 150],
#    'max_features': ['auto', 'sqrt', 'log2'],
#    'max_depth' : [i for i in range(5,10)]
#}
#
#
#tscv = TimeSeriesSplit(n_splits=12)
#gsearch = GridSearchCV(estimator=model, cv=tscv, param_grid=param_search, scoring = SMAPE_score)
#
#gsearch.fit(train_set, y_train)
#
#best_score = gsearch.best_score_
#best_model = gsearch.best_estimator_
#
#print('Lowest Training SMAPE: ', round(best_score,2))

In [None]:
#y_true = np.array(y_val.values)
#y_pred = best_model.predict(validation_set)
#
#print('Validation SMAPE: ', round(SMAPE(y_true, y_pred), 2))

### Visual Comparison

In [None]:
#d = {'Actual' : pd.Series(y_true), 'Forecasted' : pd.Series(y_pred)}
#comparison_df = pd.DataFrame(d)
#
#sns.scatterplot(data = comparison_df, 
#             x = comparison_df.index, 
#             y = 'Forecasted',
#             color = 'darkred')
#sns.lineplot(data = comparison_df, 
#             x = comparison_df.index, 
#             y = 'Actual',
#             alpha = 0.8)
#
#plt.show()

In [None]:
# Final prediction
#y_pred = best_model.predict(test_set)
#
# CSV Submission
#submission = pd.DataFrame(test_data.index)
#submission['num_sold'] = y_pred
#submission['num_sold'] = np.round(submission['num_sold']).astype(int) 
#submission.to_csv('submission.csv', index=False)

# LGBM

In [None]:
# Train and validation rows stacked back together, in that order.
# pd.concat is used because DataFrame.append / Series.append were removed in pandas 2.0.
complete_series = pd.concat([train_set, validation_set])
complete_y = pd.concat([y_train, y_val])

complete_series = complete_series.astype('float64', errors = 'ignore')
complete_y = complete_y.astype('float64', errors = 'ignore')

In [None]:
# SMAPE: Symmetric mean absolute percentage error (adjusted MAPE)
#def smape(preds, target):
#    n = len(preds)
#    masked_arr = ~((preds == 0) & (target == 0))
#    preds, target = preds[masked_arr], target[masked_arr]
#    num = np.abs(preds-target)
#    denom = np.abs(preds)+np.abs(target)
#    smape_val = (200*np.sum(num/denom))/n
#    return smape_val
#
#def lgbm_smape(y_true, y_pred):
#    smape_val = smape(y_true, y_pred)
#    return 'SMAPE', smape_val, False

In [None]:
#lgbm_params = { "num_leaves":[20, 31, 40],
#                "max_depth":[-1, 20, 30, 40],
#                "learning_rate":[0.125, 0.1, 0.05], 
#                "n_estimators":[10000, 15000, 50000],
#                "min_split_gain":[0.0, 2,5], 
#                "min_child_samples":[5, 10, 20, 30], 
#                "colsample_bytree":[0.5, 0.8, 1.0], 
#                "reg_alpha":[0.0, 0.5, 1, 1.5], 
#                "reg_lambda":[0.0, 0.5, 1]}

#model = lgb.LGBMRegressor(random_state=1505)

#tscv = TimeSeriesSplit(n_splits=3)

#rsearch = RandomizedSearchCV(model, 
#                             lgbm_params, 
#                             random_state=1505, 
#                             cv=tscv, 
#                             scoring=make_scorer(smape), 
#                             verbose = True, 
#                             n_jobs = -1).fit(complete_series, complete_y)

#print(rsearch.best_params_)

In [None]:
#model_tuned = lgb.LGBMRegressor(**rsearch.best_params_, random_state=1505).fit(train_set, y_train)
#
#print("TRAIN SMAPE:", smape(y_train, model_tuned.predict(train_set)))
#
#print("VALID SMAPE:", smape(y_val, model_tuned.predict(validation_set)))

In [None]:
#model_tuned = lgb.LGBMRegressor(**rsearch.best_params_, random_state=1505, metric = "custom")
#              
#model_tuned.fit(
#    complete_series, complete_y,
#    eval_metric= lambda y_true, y_pred: [lgbm_smape(y_true, y_pred)],
#    eval_set = [(train_set, y_train), (validation_set, y_val)],
#    eval_names = ["Train", "Valid"],
#    callbacks = [lgb.log_evaluation(1000), 
#                 lgb.early_stopping(2500)])
#
#print("Best Iteration:", model_tuned.booster_.best_iteration)

In [None]:
#def geometric_round(arr):
#    result_array = arr
#    result_array = np.where(result_array < np.sqrt(np.floor(arr)*np.ceil(arr)), np.floor(arr), result_array)
#    result_array = np.where(result_array >= np.sqrt(np.floor(arr)*np.ceil(arr)), np.ceil(arr), result_array)
#    return result_array
#
#y_true = np.array(complete_y.values)
#y_pred = geometric_round(model_tuned.predict(complete_series))
#
#print('Validation SMAPE: ', round(SMAPE(y_true, y_pred), 2))

In [None]:
#d = {'Actual' : pd.Series(y_true), 'Forecasted' : pd.Series(y_pred)}
#comparison_df = pd.DataFrame(d)
#
#sns.scatterplot(data = comparison_df, 
#             x = comparison_df.index, 
#             y = 'Forecasted',
#             color = 'darkred')
#sns.lineplot(data = comparison_df, 
#             x = comparison_df.index, 
#             y = 'Actual',
#             alpha = 0.8)
#
#plt.show()

In [None]:
# Make predictions
#test_set = test_set.astype('float64', errors = 'ignore')
#preds_test = geometric_round(model_tuned.predict(test_set))

# Save predictions to file
#output = pd.DataFrame({'row_id': test_set.index,
#                       'num_sold': preds_test})

# Check format
#output.head()

In [None]:
#output.to_csv('submission.csv', index=False)

# Catboost

In [None]:
#from sklearn.model_selection import KFold
#from sklearn.metrics import mean_squared_error
#import optuna

In [None]:
#
#N_split = 5
#kf = KFold(n_splits=N_split)
#
#def objective(trial):
#    params = {
#                'device_config' : 'GPU',
#                'eval_metric': 'SMAPE', 
#                'use_best_model': True,
#                'random_seed' : 1,
#                'learning_rate' :trial.suggest_loguniform('learning_rate', 0.01, 0.3),
#                "depth": trial.suggest_int("depth", 1, 15),
#                'l2_leaf_reg' :trial.suggest_loguniform('l2_leaf_reg', 1e-8, 20),
#                'random_strength' : trial.suggest_loguniform('random_strength', 1, 50),
#                'grow_policy':trial.suggest_categorical ('grow_policy', ['Lossguide','SymmetricTree']),
#                'max_bin': trial.suggest_int("max_bin", 20, 500),
#                'min_data_in_leaf':trial.suggest_int('min_data_in_leaf', 1, 100),
#                'bootstrap_type': trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli"]),
#                'logging_level' : 'Silent'
#            }
#    
#    if params['grow_policy'] == 'Lossguide':
#        params['max_leaves'] = trial.suggest_int('max_leaves', 1, 100)
#    if params["bootstrap_type"] == "Bayesian":
#        params["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
#    elif params["bootstrap_type"] == "Bernoulli":
#        params["subsample"] = trial.suggest_float("subsample", 0.1, 1)
#        
#    
#    score_list = []
#    
#    for fold, (train_idx, val_idx) in enumerate(kf.split(train_set, y_train)):
#        X_tr = train_set.loc[train_idx]
#        X_va = train_set.iloc[val_idx]
#        
#        # Preprocess the data
#        X_tr_f = X_tr 
#        y_tr = y_train.loc[train_idx].values
#
#        X_va_f = X_va
#        y_va = y_train.loc[val_idx].values
#
#        # Train the model
#        model = CatBoostRegressor(**params) 
#        model.fit(        
#                X_tr_f,
#                y_tr,
#                eval_set =[( X_va_f,y_va)],
#                verbose =0,
#                early_stopping_rounds = 200)
#
#        # Predictions
#        y_va_pred = model.predict(X_va_f)
#        score = mean_squared_error(y_va, y_va_pred,squared = True)
#        score_list.append(score)
#        
#    return sum(score_list) / len(score_list)
#
#study = optuna.create_study(direction='minimize')
#study.optimize(objective, n_trials=20)

In [None]:
#cat_params = {  
#        'eval_metric': 'SMAPE',
#        'logging_level' : 'Silent',
#        'random_state' : 1518,
#        'device_config' : 'GPU',
#        'early_stopping_rounds' : 200}
#
#model = CatBoostRegressor(**cat_params) 
#grid = {'learning_rate': [0.1, 0.05, 0.025, 0.005],
#        'depth': [4, 6, 8],
#        'l2_leaf_reg': [0.5, 1, 3, 5]}
#
#gscv = GridSearchCV(estimator = model, param_grid = grid, scoring = SMAPE_score, cv = 5, n_jobs=-1, verbose = 1)
#gscv.fit(train_set, y_train)
#
#best_score = gscv.best_score_
#best_model = gscv.best_estimator_
#
#print('Lowest Training SMAPE: ', round(best_score,2))

In [None]:
#best_model = CatBoostRegressor(eval_metric =  'SMAPE', 
#                               logging_level = 'Silent', 
#                               device_config = 'GPU', 
#                               random_seed = 1)
#
#best_model.set_params(**study.best_params)
#best_model.fit(train_set, y_train)

In [None]:
#y_true = np.array(y_val.values)
#y_pred = best_model.predict(validation_set)
#
#print('Validation SMAPE: ', round(SMAPE(y_true, y_pred), 2))

### Visual Comparison

In [None]:
#d = {'Actual' : pd.Series(y_true), 'Forecasted' : pd.Series(y_pred)}
#comparison_df = pd.DataFrame(d)
#
#sns.scatterplot(data = comparison_df, 
#             x = comparison_df.index, 
#             y = 'Forecasted',
#             color = 'darkred')
#sns.lineplot(data = comparison_df, 
#             x = comparison_df.index, 
#             y = 'Actual',
#             alpha = 0.8)
#
#plt.show()

In [None]:
#shap_values = best_model.get_feature_importance(Pool(validation_set, 
#                                                label = y_val), 
#                                                type="ShapValues")
#expected_value = shap_values[0,-1]
#shap_values = shap_values[:,:-1]
#
#shap.initjs()
#shap.force_plot(expected_value, shap_values[3,:], test_set.iloc[3,:])


In [None]:
#shap.summary_plot(shap_values, test_set)

In [None]:
## Final prediction catboost
#y_pred = best_model.predict(test_set)
#
## CSV Submission
#submission = pd.DataFrame(test_data.index)
#submission['num_sold'] = y_pred
#submission['num_sold'] = np.round(submission['num_sold']).astype(int) 
#submission.to_csv('submission.csv', index=False)

# Hybrid Model

In [None]:
class HybridModel:
    """Two-stage additive model: ``model_1`` fits the target, ``model_2`` fits the rest.

    ``model_1`` is trained on the target itself and its in-sample predictions
    (called the "trend" here) are subtracted from the target. ``model_2`` is
    then trained on that remainder. The final prediction is the sum of both
    stages.

    Parameters
    ----------
    model_1 : estimator exposing ``fit(X, y)`` and ``predict(X)``
        First-stage (trend) model.
    model_2 : estimator exposing ``fit(X, y)`` and ``predict(X)``
        Second-stage model trained on ``y - trend``.
    grid : dict, optional
        Parameter grid for ``model_2``. When truthy, ``model_2`` is wrapped in
        a ``GridSearchCV`` using a 3-split ``TimeSeriesSplit``; no ``scoring``
        is passed, so the search uses its default scorer.

    Attributes set by ``fit``
    -------------------------
    y_train_trend, y_train_resid : in-sample predictions of stage 1 / stage 2.
    grid_model : the fitted ``GridSearchCV`` (only when ``grid`` is truthy).

    Attributes set by ``predict``
    -----------------------------
    y_test_trend, y_test_resid : the two components of the last prediction.
    """

    def __init__(self, model_1, model_2, grid=None):
        self.model_1 = model_1
        self.model_2 = model_2
        self.grid = grid

    def fit(self, X_train_1, X_train_2, y):
        """Fit both stages and return ``self``.

        ``X_train_1`` feeds ``model_1`` and ``X_train_2`` feeds the
        second-stage model; ``y`` is the common target.
        """
        # Stage 1: fit the trend model and take its in-sample predictions
        self.model_1.fit(X_train_1, y)
        y_trend = self.model_1.predict(X_train_1)

        # Stage 2 estimator: either a grid search around model_2 or model_2 itself
        if self.grid:
            tscv = TimeSeriesSplit(n_splits=3)
            resid_model = GridSearchCV(estimator=self.model_2, cv=tscv, param_grid=self.grid)
        else:
            resid_model = self.model_2

        # Train stage 2 on the detrended series
        resid_model.fit(X_train_2, y - y_trend)

        # Stage 2 in-sample predictions (for residual analysis)
        y_resid = resid_model.predict(X_train_2)

        # Keep the search object around so predict() and callers can use it
        if self.grid:
            self.grid_model = resid_model

        # Save data
        self.y_train_trend = y_trend
        self.y_train_resid = y_resid

        # Returning the instance follows the scikit-learn fit() convention
        return self

    def predict(self, X_test_1, X_test_2):
        """Return stage-1 plus stage-2 predictions for the given features.

        Requires a prior ``fit``; with a truthy ``grid`` an unfitted instance
        has no ``grid_model`` attribute and this raises ``AttributeError``.
        """
        # Predict trend using model 1
        y_trend = self.model_1.predict(X_test_1)

        # Use the same stage-2 estimator that fit() trained
        resid_model = self.grid_model if self.grid else self.model_2
        y_resid = resid_model.predict(X_test_2)

        # Save data
        self.y_test_trend = y_trend
        self.y_test_resid = y_resid

        # Add predictions together
        return y_trend + y_resid

In [None]:
# Stage-1 learner (reused by every hybrid model) and the stage-2 candidates
model_1 = LinearRegression()
models_2 = [
    LGBMRegressor(random_state=1505),
    CatBoostRegressor(random_state=0, verbose=False),
    XGBRegressor(random_state=1505),
]

# Search space handed to each hybrid model's grid search
param_grid = {
    'n_estimators': [100, 150, 200, 225, 250, 275, 300],
    'max_depth': [4, 5, 6, 7],
    'learning_rate': [0.1, 0.12, 0.13, 0.14, 0.15],
}

# Running sums of the per-model predictions (test set / training set)
y_pred = np.zeros(len(test_data))
train_preds = np.zeros(len(complete_y))

# One hybrid model per stage-2 candidate; predictions are summed here and
# averaged after the loop
for model_2 in models_2:
    start = time.time()

    # The target is modelled on the log scale ...
    model = HybridModel(model_1, model_2, grid=param_grid)
    model.fit(complete_series, complete_series, np.log(complete_y))

    # ... so predictions are mapped back with exp before being accumulated
    y_pred += np.exp(model.predict(test_set, test_set))

    # In-sample predictions, kept for the residual analysis
    train_preds += np.exp(model.y_train_trend + model.y_train_resid)

    stop = time.time()
    elapsed_mins = round((stop - start) / 60, 2)
    print(f'Model_2:{model_2} -- time:{elapsed_mins} mins')

    if model.grid:
        print('Best parameters:', model.grid_model.best_params_, '\n')

# Turn the sums into means over the ensemble members
n_models = len(models_2)
y_pred = y_pred / n_models
train_preds = train_preds / n_models

In [None]:
# From https://www.kaggle.com/fergusfindley/ensembling-and-rounding-techniques-comparison
def geometric_round(arr):
    """Round values to a neighbouring integer using a geometric-mean cutoff.

    For each value the cutoff is ``sqrt(floor(x) * ceil(x))``: values below it
    are rounded down, values at or above it are rounded up. Values that are
    already whole numbers are returned unchanged.

    Returns a NumPy array with the same shape as ``arr``.
    """
    lower = np.floor(arr)
    upper = np.ceil(arr)
    cutoff = np.sqrt(lower * upper)
    return np.where(arr < cutoff, lower, upper)

# Snap the averaged forecasts to whole numbers
y_pred = geometric_round(y_pred)

# Submission table: one row per test index value with its forecast
output = pd.DataFrame(
    {
        'row_id': test_data.index,
        'num_sold': y_pred,
    }
)

# Preview the first rows to check the format
output.head()

In [None]:
# `train_preds` already holds the ensemble-averaged training predictions from
# the ensembling loop above. Recomputing it here from `model` would silently
# keep only the last fitted hybrid model, so it is used as-is.

# Signed per-sample SMAPE term on the training set, in percent
residuals = 200 * (train_preds - complete_y) / (train_preds + complete_y)

# Plot residuals against sample position
plt.figure(figsize = (12,4))
plt.scatter(np.arange(len(residuals)),residuals, s = 1)
# Zero line spans the same positional x-range as the scatter, whatever the
# index of `residuals` happens to be
plt.hlines([0], 0, len(residuals) - 1, color='k')
plt.title('Residuals on training set')
plt.xlabel('Sample')
plt.ylabel('SMAPE')

In [None]:
# Load the stats submodule explicitly: a bare `import scipy` does not
# guarantee `scipy.stats` is available on older SciPy releases
import scipy.stats

# Fit a normal distribution to the residuals for the overlay curve
mu, std = scipy.stats.norm.fit(residuals)

plt.figure(figsize=(12,4))
plt.hist(residuals, bins=100, density=True)

# Evaluate the fitted density across the histogram's current x-range
x = np.linspace(plt.xlim()[0], plt.xlim()[1], 200)
plt.plot(x, scipy.stats.norm.pdf(x, mu, std), 'r', linewidth=2)

# The second piece is a raw f-string so that `\sigma` reaches matplotlib's
# mathtext unchanged without being an invalid Python escape sequence
plt.title(f'Histogram of residuals; mean = {residuals.mean():.4f}, '
          rf'$\sigma = {residuals.std():.1f}$, SMAPE = {residuals.abs().mean():.5f}')
plt.xlabel('Residual (percent)')
plt.ylabel('Density')
plt.show()

In [None]:
# Write the submission table to disk; index=False leaves the DataFrame index out of the file
output.to_csv('submission.csv', index=False)