In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory and print the full path of every file in it.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install pycaret[full] --quiet

In [None]:
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Silence every warning from here on (no category or module filter is given),
# which also hides deprecation notices raised by the libraries used below.
warnings.filterwarnings("ignore")

# import the regression module
from pycaret.regression import *

In [None]:
# Load both splits with the first CSV column as the index (it is used as
# `row_id` when the submission is built).
# `parse_dates=True` would make pandas try to parse that index as dates; naming
# the `date` column parses the column that actually holds dates.
train = pd.read_csv("../input/tabular-playground-series-jan-2022/train.csv", index_col=0, parse_dates=["date"])
test = pd.read_csv("../input/tabular-playground-series-jan-2022/test.csv", index_col=0, parse_dates=["date"])

In [None]:
# Make sure the `date` column of both splits holds datetime values.
for frame in (train, test):
    frame['date'] = pd.to_datetime(frame['date'])

In [None]:
import plotly.express as px

# Build a separate frame (train itself is left untouched) with a rolling mean of
# `num_sold` over 12 consecutive rows. The window counts rows, not calendar
# months, so its time span depends on how many rows the data has per date.
df = train.assign(mov_avg=train['num_sold'].rolling(window=12).mean())

# Draw the raw sales and their rolling mean against the date.
fig = px.line(df, x="date", y=["num_sold", "mov_avg"], template='plotly_dark')
fig.show()

## Creating some features

Reference: [this notebook](https://www.kaggle.com/maxencefzr/tps-jan22-eda-simple-catboost?scriptVersionId=84486229&cellId=26)

In [None]:
import holidays

# [date, name] pairs for 2015-2019: all Finnish holidays first, then Norwegian.
holiday_list = [
    [day, label]
    for calendar in (
        holidays.Finland(years=[2015, 2016, 2017, 2018, 2019]),
        holidays.Norway(years=[2015, 2016, 2017, 2018, 2019]),
    )
    for day, label in calendar.items()
]

# Swedish holidays follow: entries named exactly 'Söndag' are skipped and the
# substring ", Söndag" is removed from the remaining names.
holiday_list += [
    [day, label.replace(", Söndag", "")]
    for day, label in holidays.Sweden(years=[2015, 2016, 2017, 2018, 2019]).items()
    if label != 'Söndag'
]

# date -> holiday name lookup. The key is the date alone, so when a date occurs
# more than once in holiday_list the entry added last is the one that is kept.
holiday_dict = {day: label for day, label in holiday_list}

In [None]:
def create_features(df):
    """Add calendar and holiday features to ``df`` in place, then drop ``date``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime-typed ``date`` column. The frame is modified in
        place; nothing is returned.

    Notes
    -----
    Holiday names are looked up in the module-level ``holiday_dict``, keyed by
    date only. Because ``date`` is dropped at the end, calling this a second
    time on the same frame raises ``KeyError``.
    """
    dates = df['date']

    df['day'] = dates.dt.day
    df['month'] = dates.dt.month
    df['year'] = dates.dt.year
    df['DayOfYear'] = dates.dt.dayofyear
    df['weekday'] = dates.dt.weekday

    # `Series.dt.weekofyear` was deprecated in pandas 1.1 and removed in 2.0.
    # The ISO calendar week is the same quantity; the cast reproduces the dtype
    # the old accessor returned (int64, or float64 when dates are missing).
    iso_week = dates.dt.isocalendar().week
    df['WeekOfYear'] = iso_week.astype('float64' if iso_week.hasnans else 'int64')

    df['quarter'] = dates.dt.quarter
    # Saturday and Sunday are weekday 5 and 6.
    df['weekend'] = (dates.dt.weekday >= 5).astype(int)

    # Dates missing from holiday_dict map to NaN: flag them first, then label them.
    df['holiday_name'] = dates.map(holiday_dict)
    df['is_holiday'] = np.where(df['holiday_name'].notnull(), 1, 0)
    df['holiday_name'] = df['holiday_name'].fillna("No holiday")

    # NOTE(review): 'DayOfMonth' holds the number of days in the month and is
    # therefore identical to 'daysinmonth'. The name suggests the day number was
    # intended, which 'day' already provides. Kept as-is so the feature set
    # does not change.
    df['DayOfMonth'] = dates.dt.days_in_month
    df['daysinmonth'] = dates.dt.days_in_month

    df.drop(columns=['date'], inplace=True)
    
# Engineer the same features on both splits; each frame is modified in place.
for frame in (train, test):
    create_features(frame)

Reference: [this discussion](https://www.kaggle.com/c/web-traffic-time-series-forecasting/discussion/36414)

In [None]:
def SMAPE(y_true, y_pred):
    """Symmetric mean absolute percentage error, on a 0-200 scale.

    Computes ``mean(|y_true - y_pred| / ((|y_true| + |y_pred|) / 2)) * 100``.
    Pairs where both values are zero contribute 0 instead of a division by zero.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values. They are compared position by position
        (any pandas index is ignored).

    Returns
    -------
    float
        The mean SMAPE over all elements; lower is better.
    """
    actual = np.asarray(y_true, dtype=float)
    forecast = np.asarray(y_pred, dtype=float)

    # Absolute values on BOTH terms: without abs() on the actuals a negative
    # value makes the denominator shrink, vanish or change sign.
    denominator = (np.abs(actual) + np.abs(forecast)) / 200.0
    abs_error = np.abs(actual - forecast)

    # Divide only where the denominator is non-zero; the other slots keep the
    # 0.0 they were initialised with.
    ratio = np.divide(
        abs_error,
        denominator,
        out=np.zeros_like(denominator),
        where=denominator != 0,
    )
    return np.mean(ratio)

## Using Pycaret

Reference: [this notebook](https://www.kaggle.com/bernhardklinger/tps-jan-2022/notebook)

In [None]:
# Initialise the PyCaret regression experiment on the engineered training frame.
reg = setup(data = train,
            target = 'num_sold',
            session_id = 42, # fixed seed so the experiment gives the same results on every run
            normalize=True,
            normalize_method='robust',
            transform_target = True,
            data_split_shuffle = False, #so that we do not use "future" observations to predict "past" observations
            create_clusters = False,
            use_gpu = True,
            silent = True, # PyCaret 2.x option: do not stop to ask for confirmation of the inferred dtypes
            fold=10,
            n_jobs = -1)

In [None]:
# List the models available to the experiment; the return value is shown as
# this cell's output.
models()

In [None]:
# Register the custom SMAPE function with PyCaret under the id and display
# name 'SMAPE'. greater_is_better=False marks it as an error metric, i.e. lower
# values are better.
add_metric('SMAPE', 'SMAPE', SMAPE, greater_is_better=False)

In [None]:
# Train and compare the candidate models, ranked by the custom SMAPE metric,
# and keep the three best (n_select=3) for blending.
best = compare_models(sort='SMAPE', n_select=3)

In [None]:
# Combine the selected models into one blended estimator, then score it.
# No `data` is passed, so predict_model presumably evaluates on PyCaret's
# hold-out split - TODO confirm.
blend = blend_models(best)
predict_model(blend)

In [None]:
# Finalize the blend for inference, then score it once more.
# NOTE(review): finalize_model presumably refits on the full dataset including
# the hold-out rows, so the metrics printed here would be optimistic - confirm.
final_blend = finalize_model(blend)
predict_model(final_blend)

## Making predictions

In [None]:
# Run the finalized blend on the engineered test frame.
predictions = predict_model(final_blend, data=test)

In [None]:
# Inspect the frame returned by predict_model.
predictions

In [None]:
# Check which columns the prediction frame contains.
predictions.columns

In [None]:
# 'Label' is the column read as the predicted `num_sold` when the submission
# is built.
predictions['Label']

In [None]:
# Pair every test row id with its predicted value, position by position.
row_ids, predicted = test.index, predictions['Label']
submission = pd.DataFrame(
    data=[(row_id, value) for row_id, value in zip(row_ids, predicted)],
    columns=['row_id', 'num_sold'],
)

# The ground truth values are integers, so rounding gets the predictions closer
# to the actual values on average.
# For more info: refer https://www.kaggle.com/c/tabular-playground-series-jan-2022/discussion/299162
submission['num_sold'] = submission['num_sold'].round().astype(int)
submission

In [None]:
# Write the submission to the current working directory; index=False keeps the
# DataFrame's own index out of the file.
submission.to_csv('./submission.csv', index=False)