This notebook builds on the work in https://www.kaggle.com/maxencefzr/tps-jan22-catboost-using-pycaret/notebook

In [None]:
%%capture
!pip install pycaret[full]

import os
import warnings

import numpy as np  # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import seaborn as sns
import matplotlib.pyplot as plt

import math
from pathlib import Path

import dateutil.easter as easter
from pycaret.regression import *

# Mute warnings
warnings.filterwarnings("ignore")

In [None]:
# Kaggle input folders: the competition data plus two auxiliary datasets.
data_dir = Path('../input/tabular-playground-series-jan-2022')
holiday_dir = Path('../input/public-and-unofficial-holidays-nor-fin-swe-201519')
gdp_dir = Path('../input/gdp-20152019-finland-norway-and-sweden')

# Options shared by the train and test reads.
_category_dtypes = {name: 'category' for name in ('country', 'store', 'product')}
_read_options = dict(
    parse_dates=['date'],
    infer_datetime_format=True,
    index_col='row_id',
)

# Only train carries the target column, which is read as float32.
train = pd.read_csv(
    data_dir / 'train.csv',
    dtype={**_category_dtypes, 'num_sold': 'float32'},
    **_read_options,
)
test = pd.read_csv(data_dir / "test.csv", dtype=_category_dtypes, **_read_options)

# The target is the first column (in `Index.difference` order) that is present
# in train but absent from test.
train_only_cols = train.columns.difference(test.columns)
target_col = train_only_cols[0]

# Auxiliary tables: holiday calendar and yearly GDP (indexed by year).
holiday_data = pd.read_csv(holiday_dir / 'holidays.csv')
gdp = pd.read_csv(
    gdp_dir / 'GDP_data_2015_to_2019_Finland_Norway_Sweden.csv',
    index_col='year',
)

In [None]:
# Categorical features: every column read with the 'category' dtype
# (country, store, product).
categorical_cols = list(train.select_dtypes(include='category').columns)

## Pre-Processing

In [None]:
K_FOLDS = 3
GDP_EXPONENT = 1.2120618918594863
# c.f https://www.kaggle.com/ambrosm/tpsjan22-03-linear-model

# NOTE: this cell is not idempotent - re-running it strips the GDP column names
# again and rescales / log-transforms the target a second time.

# Drop the first 4 characters of every GDP column name so the remainder can be
# matched against `train.country` (presumably a `GDP_` prefix - confirm against the CSV).
gdp.columns = gdp.columns.str[4:]
gdp = gdp ** GDP_EXPONENT

# Scale every year relative to the last row of the GDP table. The reference row
# is addressed directly instead of via `K_FOLDS + 1`, which only pointed at the
# same row by coincidence and would break as soon as the fold count changed.
scaler = gdp.iloc[-1] / gdp
gdp_map = scaler.stack().to_dict()  # {(year, country): scaling factor}

# Build the lookup keys on train's own index: the multiplication below aligns
# on index labels, so a default 0..n-1 index is only correct when `row_id`
# happens to be exactly that range.
gdp_keys = pd.Series(
    list(zip(train.date.dt.year, train.country)),
    index=train.index,
)
train[target_col] = gdp_keys.map(gdp_map) * train[target_col]

# Model the target in log space; predictions are mapped back with expm1 later.
train[target_col] = np.log1p(train[target_col])

In [None]:
# Event / Holidays for Finland, Sweden and Norway
# Preview the first rows; the `date` and `country` columns are the ones
# `holiday_features` reads.
holiday_data.head(5)

In [None]:
def holiday_features(holiday_df, df):
    """Add holiday and special-date features to ``df`` in place.

    Parameters
    ----------
    holiday_df : pandas.DataFrame
        Holiday calendar with a ``country`` column and a ``date`` column
        (strings or datetimes; converted with ``pd.to_datetime``).
    df : pandas.DataFrame
        Frame with a datetime ``date`` column and a ``country`` column.
        It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with four added columns:

        * ``holiday`` - 1 when the row's date is a holiday in the row's own
          country (Finland, Sweden or Norway), else 0.
        * ``days_from_easter`` - days since Easter Sunday of the row's year,
          clipped to [-5, 65].
        * ``days_from_wed_jun`` - days since the last Wednesday of June,
          clipped to [-5, 5].
        * ``days_from_sun_nov`` - days since the first Sunday of November,
          clipped to [-1, 9].

    Notes
    -----
    The June and November anchor dates are only tabulated for 2015-2019; rows
    from other years get missing values in those two columns. A Mother's Day
    (last Sunday of May) feature was disabled in the original notebook and is
    not produced.
    """
    # Cast explicitly: matching a datetime64 column against date *strings* with
    # `isin` relies on implicit casting that pandas has deprecated, and the
    # resulting FutureWarning is muted by this notebook's warning filter.
    holiday_dates = pd.to_datetime(holiday_df['date'])

    df['holiday'] = 0
    for country in ('Finland', 'Sweden', 'Norway'):
        country_dates = holiday_dates[holiday_df['country'] == country]
        country_rows = df['country'] == country
        df.loc[country_rows, 'holiday'] = (
            df.loc[country_rows, 'date'].isin(country_dates).astype(int)
        )

    years = df['date'].dt.year

    # Easter: computed once per distinct year rather than once per row.
    easter_by_year = {
        int(year): pd.Timestamp(easter.easter(int(year))) for year in years.unique()
    }
    # `pd.to_datetime` keeps the dtype datetime-like even when `df` is empty.
    easter_date = pd.to_datetime(years.map(easter_by_year))
    df['days_from_easter'] = (df['date'] - easter_date).dt.days.clip(-5, 65)

    # Last Wednesday of June
    wed_june_date = years.map({
        2015: pd.Timestamp('2015-06-24'),
        2016: pd.Timestamp('2016-06-29'),
        2017: pd.Timestamp('2017-06-28'),
        2018: pd.Timestamp('2018-06-27'),
        2019: pd.Timestamp('2019-06-26'),
    })
    df['days_from_wed_jun'] = (df['date'] - wed_june_date).dt.days.clip(-5, 5)

    # First Sunday of November (second Sunday is Father's Day)
    sun_nov_date = years.map({
        2015: pd.Timestamp('2015-11-01'),
        2016: pd.Timestamp('2016-11-06'),
        2017: pd.Timestamp('2017-11-05'),
        2018: pd.Timestamp('2018-11-04'),
        2019: pd.Timestamp('2019-11-03'),
    })
    df['days_from_sun_nov'] = (df['date'] - sun_nov_date).dt.days.clip(-1, 9)

    return df

# Add the holiday / special-date columns to both splits (train first, then test).
train, test = (holiday_features(holiday_data, frame) for frame in (train, test))

In [None]:
# Inspect the day-of-year values; `fourier_features` in the next cell builds
# its sine/cosine terms from this quantity.
train.date.dt.dayofyear

In [None]:
def fourier_features(df):
    """Add yearly sine/cosine terms (harmonics 1-4) to ``df`` in place.

    For each harmonic ``k`` the columns ``sin{k}`` and ``cos{k}`` are computed
    from the day of year over a 365.25-day period. Each of them is also
    multiplied by the ``product_Kaggle Mug`` and ``product_Kaggle Hat``
    indicator columns, giving ``mug_sin{k}``, ``mug_cos{k}``, ``hat_sin{k}``
    and ``hat_cos{k}``.

    ``df`` must already contain ``date`` and both indicator columns.
    Returns the same ``df`` object.
    """
    base_angle = df.date.dt.dayofyear / 365.25 * 2 * math.pi
    product_indicators = (
        ('mug', 'product_Kaggle Mug'),
        ('hat', 'product_Kaggle Hat'),
    )
    for k in range(1, 5):
        harmonic = base_angle * k
        df[f'sin{k}'] = np.sin(harmonic)
        df[f'cos{k}'] = np.cos(harmonic)
        # Product-specific seasonality: the wave is zeroed out for other products.
        for prefix, indicator in product_indicators:
            for wave in ('sin', 'cos'):
                df[f'{prefix}_{wave}{k}'] = df[f'{wave}{k}'] * df[indicator]
    return df

In [None]:
# One-hot encode the categorical columns of both splits (train first, then test).
train, test = (
    pd.get_dummies(frame, columns=categorical_cols) for frame in (train, test)
)

In [None]:
# Add Fourier features to both splits (train first, then test)
train, test = (fourier_features(frame) for frame in (train, test))

In [None]:
# Preview the first rows after one-hot encoding and the Fourier columns were added.
train.head()

In [None]:
def new_date_features(df):
    """Derive calendar features from ``df['date']`` and drop the ``date`` column.

    ``df`` is modified in place and also returned. Added columns, in order:
    ``year``, ``quarter``, ``month``, ``week`` (ISO week number), ``day``,
    ``weekday`` (Monday=0 ... Sunday=6), ``day_of_year``, ``day_of_month``,
    ``is_weekend`` (1 for Saturday/Sunday) and ``is_friday``.
    """
    dates = df['date']

    df['year'] = dates.dt.year
    df['quarter'] = dates.dt.quarter
    df['month'] = dates.dt.month

    # `Series.dt.week` was deprecated in pandas 1.1 and removed in 2.0;
    # `isocalendar().week` is the replacement. It returns a nullable UInt32
    # column, so cast back to the dtype `dt.week` produced: plain int, or
    # float when missing dates are present.
    iso_week = dates.dt.isocalendar().week
    df['week'] = iso_week.astype(int if iso_week.notna().all() else float)

    df['day'] = dates.dt.day
    df['weekday'] = dates.dt.weekday
    df['day_of_year'] = dates.dt.dayofyear
    # NOTE: despite its name this column holds the number of days in the month
    # (28-31); the day within the month is in `day`. The name is kept so the
    # feature set stays unchanged.
    df['day_of_month'] = dates.dt.days_in_month
    df['is_weekend'] = df['weekday'].isin([5, 6]).astype(int)
    df['is_friday'] = (df['weekday'] == 4).astype(int)

    df.drop(columns='date', inplace=True)

    return df
    

In [None]:
# Add Date features to both splits (train first, then test)
train, test = (new_date_features(frame) for frame in (train, test))

In [None]:
def smape(actual, predicted):
    """Symmetric mean absolute percentage error, in percent.

    Each term is ``|predicted - actual| / ((|actual| + |predicted|) / 2)``.
    Terms where both values are zero contribute 0 instead of the NaN that a
    0/0 division would otherwise propagate into the mean.

    Parameters
    ----------
    actual, predicted : array-like
        Values to compare. They are converted to float arrays and compared
        by position, so pandas index labels cannot realign the two inputs.

    Returns
    -------
    float
        Mean of the per-element terms multiplied by 100.
    """
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)

    numerator = np.abs(predicted - actual)
    denominator = (np.abs(actual) + np.abs(predicted)) / 2

    # Divide only where the denominator is non-zero; the other slots keep the
    # 0 they were initialised with.
    ratio = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )
    return np.mean(ratio) * 100

In [None]:
# Display the training frame after all feature engineering steps.
train

In [None]:
NB_MODELS = 4
BASE_SEED = 42  # each model gets BASE_SEED + i, so models differ but re-runs repeat

models = []

for i in range(NB_MODELS):
    print('Fit Model', i)
    # A distinct, fixed `session_id` per model keeps the ensemble members
    # different from each other while making a re-run repeatable (as far as
    # the GPU backend allows). Without it pycaret picks a fresh random seed
    # on every `setup` call.
    reg = setup(
        data=train,
        target=target_col,
        data_split_shuffle=False,
        create_clusters=False,
        fold_strategy='groupkfold',
        fold_groups='year',  # cross-validation folds are grouped by the `year` column
        use_gpu=True,
        silent=True,
        fold=K_FOLDS,
        normalize=True,
        n_jobs=-1,
        session_id=BASE_SEED + i,
    )

    # Registered after every `setup` call, as in the original loop (presumably
    # `setup` resets the metric list - confirm). The target was log1p-transformed
    # earlier, so this score is presumably SMAPE in log space, not on raw sales.
    add_metric('SMAPE', 'SMAPE', smape, greater_is_better=False)

    models.append(create_model('catboost'))

In [None]:
# Combine the fitted CatBoost models into a single estimator with pycaret's `blend_models`.
blend = blend_models(models)

In [None]:
# Finalize the blend; the result is the estimator used for the test-set
# predictions below. (Per the pycaret docs `finalize_model` refits on the full
# dataset including the hold-out split - confirm for the installed version.)
final_blend = finalize_model(blend)

In [None]:
# Fit-Based Weights Geo-Rounded
# from https://www.kaggle.com/fergusfindley/ensembling-and-rounding-techniques-comparison
def geometric_round(arr):
    """Round each value down or up, using a geometric-mean cut-off.

    For every element the cut-off is ``sqrt(floor(x) * ceil(x))``. Values
    strictly below the cut-off become ``floor(x)``; all others become
    ``ceil(x)``. Whole numbers are returned unchanged.

    Returns a NumPy array of floats.
    """
    lower = np.floor(arr)
    upper = np.ceil(arr)
    cutoff = np.sqrt(lower * upper)
    return np.where(arr < cutoff, lower, upper)

In [None]:
# Predict with the finalized blend; the 'Label' column holds the predictions,
# which are in log1p space like the training target.
predicted_log = predict_model(final_blend, data=test)['Label']

# Undo the log1p transform, then round to whole units with the geometric rule.
y_pred = np.expm1(predicted_log)
y_pred = geometric_round(np.array(y_pred).transpose()).astype(int)
y_pred

In [None]:
# Read the sample submission from the same competition folder as train/test
# (`data_dir`) instead of repeating the path as a separate string literal.
submission = pd.read_csv(data_dir / 'sample_submission.csv')

# Positional assignment: assumes the sample submission rows are in the same
# order as test.csv - TODO confirm.
submission[target_col] = y_pred

submission.to_csv('submission.csv', index=False)