This notebook incorporates the improvements to my previous work made in https://www.kaggle.com/bernhardklinger/tps-jan-2022/notebook - thanks for elaborating on my work!

In [None]:
%%capture
#suppress output and install the full pyCaret library
!pip install pycaret[full]

In [None]:
## Basic packages
import pandas as pd
import numpy as np

#pycaret
from pycaret.regression import *

In [None]:
#this is an aesthetic choice: it suppresses the many warnings that some functions and commands produce
#which significantly declutters the notebook
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the competition train and test CSV files into pandas DataFrames,
# using the `row_id` column of each file as the DataFrame index.
train = pd.read_csv('../input/tabular-playground-series-jan-2022/train.csv', index_col = 'row_id')
test = pd.read_csv('../input/tabular-playground-series-jan-2022/test.csv', index_col = 'row_id')

In [None]:
# Credit to https://www.kaggle.com/jaredfeng/tps-jan22-inprog-v5

# Location of the holiday CSV that is read by GetHoliday (it uses the
# file's `Date` and `Country` columns).
holiday_path = '../input/holidays-finland-norway-sweden-20152019/Holidays_Finland_Norway_Sweden_2015-2019.csv'

def GetHoliday(holiday_path, df):
    """
    Flag every row of ``df`` whose date is a holiday in that row's country.

    Parameters
    ----------
    holiday_path : anything ``pd.read_csv`` accepts
        Holiday table with at least a `Date` and a `Country` column.
    df : pandas.DataFrame
        Needs a `date` and a `country` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with an integer `holiday` column added
        (1 = holiday in the row's own country, 0 = otherwise). Only
        Finland, Sweden and Norway are looked up; rows of any other
        country keep 0.
    """
    calendar = pd.read_csv(holiday_path)

    # Start with "no holiday" everywhere, then switch on the matches
    # country by country.
    df['holiday'] = np.zeros(len(df)).astype(int)

    for nation in ('Finland', 'Sweden', 'Norway'):
        national_dates = calendar.loc[calendar.Country == nation, 'Date']
        rows = df.country == nation
        # A date only counts when it is a holiday in the row's own country.
        df.loc[rows, 'holiday'] = df.loc[rows, 'date'].isin(national_dates).astype(int)

    return df

# Add the integer `holiday` flag column to both data sets. GetHoliday
# modifies the frame it is given and returns that same frame.
train = GetHoliday(holiday_path, train)
test = GetHoliday(holiday_path, test)

In [None]:
# Credit to https://www.kaggle.com/ranjeetshrivastav/tps-jan-21-base-xgb
# and https://www.kaggle.com/bernhardklinger/tps-jan-2022/notebook

def feature_eng(df):
    """
    Replace the `date` column of ``df`` with calendar features, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a `date` column that ``pd.to_datetime`` can parse.
        The frame is modified in place and nothing is returned.

    Columns added (in this order)
    -----------------------------
    week      : ISO-8601 week number (int64)
    year      : 'Y' + four-digit year, e.g. 'Y2016' (string, so it is
                treated as a category rather than a number)
    quarter   : 'Q1' .. 'Q4'
    day       : day of the month
    dayofyear : day of the year, with leap years squeezed onto a
                365-day scale (see comment below)
    weekend   : True for Saturday and Sunday
    weekday   : 'WD0' (Monday) .. 'WD6' (Sunday)

    The original `date` column is dropped at the end.
    """
    df['date'] = pd.to_datetime(df['date'])
    # `Series.dt.week` was deprecated in pandas 1.1 and removed in 2.0.
    # `dt.isocalendar().week` yields the same ISO week number; cast it back
    # to int64 because isocalendar() returns the nullable UInt32 dtype.
    df['week'] = df['date'].dt.isocalendar().week.astype('int64')
    df['year'] = 'Y' + df['date'].dt.year.astype(str)
    df['quarter'] = 'Q' + df['date'].dt.quarter.astype(str)
    df['day'] = df['date'].dt.day
    df['dayofyear'] = df['date'].dt.dayofyear
    # In a leap year, shift Feb 29 (day 60) and everything after it back by
    # one so that the same calendar date gets the same number in every year
    # (Feb 29 shares 59 with Feb 28, Dec 31 is always 365).
    df.loc[(df.date.dt.is_leap_year) & (df.dayofyear >= 60), 'dayofyear'] -= 1
    df['weekend'] = df['date'].dt.weekday >= 5
    df['weekday'] = 'WD' + df['date'].dt.weekday.astype(str)
    df.drop(columns=['date'], inplace=True)

# Derive the calendar features for both data sets. feature_eng works in
# place (it returns nothing) and drops the original `date` column.
feature_eng(train)
feature_eng(test)

In [None]:
# Inspect the column dtypes of the training frame after feature engineering
train.dtypes

In [None]:
# Credit to https://www.kaggle.com/c/web-traffic-time-series-forecasting/discussion/36414
def SMAPE(y_true, y_pred):
    """
    Symmetric mean absolute percentage error, on a 0-200 scale.

    SMAPE = mean( |y_true - y_pred| / ((|y_true| + |y_pred|) / 2) ) * 100

    Parameters
    ----------
    y_true, y_pred : array-like of numbers with the same shape
        Lists, NumPy arrays and pandas Series are accepted. Values are
        compared by position, so the index of a Series is ignored.

    Returns
    -------
    float
        The mean of the per-element errors. Elements where both the actual
        and the predicted value are zero contribute an error of 0.
    """
    # Work on plain float arrays: this allows list input and stops pandas
    # from aligning two Series on their index labels (which would produce
    # NaN wherever the labels differ).
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Absolute value on BOTH terms, as in the SMAPE definition; without it a
    # negative y_true shrinks (or even cancels) the denominator.
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 200.0

    # Divide only where the denominator is non-zero; the remaining entries
    # keep their initial 0.0, so no division-by-zero warning or NaN occurs.
    diff = np.zeros_like(denominator)
    np.divide(np.abs(y_true - y_pred), denominator, out=diff, where=denominator != 0)
    return np.mean(diff)

In [None]:
# Initialise the PyCaret regression experiment on the training frame, with
# `num_sold` as the target column. The functions called in later cells
# (compare_models, blend_models, ...) operate on this experiment.
# NOTE(review): `silent` and `feature_interaction` look like PyCaret 2.x
# arguments and the install cell does not pin a version - confirm that the
# installed release still accepts them.
reg = setup(data = train,
            target = 'num_sold',
            train_size = 0.75, #75:25 train/validation split
            normalize = True, #normalisation helps some algorithms
            normalize_method = 'robust', #resilient to outliers
            transform_target = True, #applies a transformation to the target column
            data_split_shuffle = False, #so that we do not use "future" observations to predict "past" observations
            #create_clusters = True, #adds additional feature by assigning clusters
            feature_interaction = True, #new features are created by interacting (a * b) all the numeric variables in the dataset
            #use_gpu = True, #use GPU acceleration to train models
            silent = True, #removes need for confirmation step
            fold = 20, #number of cross-validation folds
            n_jobs = -1) #use all processor threads

In [None]:
# List all models available in the PyCaret regression module
models()

In [None]:
# Register the SMAPE function defined earlier as a PyCaret metric (id and
# display name are both 'SMAPE'). greater_is_better = False marks it as an
# error measure, i.e. lower values are better.
add_metric('SMAPE', 'SMAPE', SMAPE, greater_is_better = False)

In [None]:
#compare_models(sort = 'MAPE')

In [None]:
# Compare all available models, rank them by the custom SMAPE metric and
# keep the N best ones (returned in `top`) for the ensemble built next.
N = 3
top = compare_models(sort = 'SMAPE', n_select = N)

In [None]:
# Blend the predictions of the individual top models into one voting
# ensemble (a voting regressor, since this is pycaret.regression).
# The blend is trained on the training split only; predict_model without a
# `data` argument then scores it on the validation (hold-out) split.
blend = blend_models(top)
predict_model(blend);

In [None]:
# Refit the blend on all data given to setup (training AND validation
# splits), then predict on the validation split again.
# NOTE: the finalized model has already seen the validation rows, so the
# scores printed here are no longer an out-of-sample estimate.
final_blend = finalize_model(blend)
predict_model(final_blend);

In [None]:
#model stacking
#only uses the training set and predicts on the validation set
#stack = stack_models(top)
#predict_model(stack);

In [None]:
# use the training and validation sets to train the model, then predict on the validation set
#final_stack = finalize_model(stack)
#predict_model(final_stack);

In [None]:
#tuned_top = [tune_model(i, optimize = 'MAPE', choose_better = True) for i in top]

In [None]:
#tuned_blend = blend_models(tuned_top)
#predict_model(tuned_blend);

In [None]:
#final_tuned_blend = finalize_model(tuned_blend)
#predict_model(final_tuned_blend);

In [None]:
# Create predictions for the unseen test observations with the finalized
# blend, then preview the first rows of the result.

unseen_predictions_blend = predict_model(final_blend, data=test)
unseen_predictions_blend.head()

In [None]:
# Build the submission table (row_id, num_sold), save it as CSV and print it

# Sanity check: exactly one prediction per test row
assert len(unseen_predictions_blend) == len(test.index)

# Pair the ids and the predicted labels by position
sub = pd.DataFrame({
    'row_id': test.index.to_numpy(),
    'num_sold': unseen_predictions_blend['Label'].to_numpy(),
})

sub.to_csv('submission.csv', index = False)

print(sub)