# TPS Jan 2022 with PyCaret

This is the second iteration of my notebook demonstrating how the PyCaret package can be used in this competition.

Ideas from

https://www.kaggle.com/mfedeli/tabular-playground-series-jan-2022 - original PyCaret notebook

https://www.kaggle.com/carlmcbrideellis/gdp-20152019-finland-norway-and-sweden - country specific GDP deflator

https://www.kaggle.com/ambrosm/tpsjan22-06-lightgbm-quickstart - features to capture movable holidays

Hope you find it useful!

# Initial Setup

In [None]:
%%capture
!pip install pycaret[full]

import pandas as pd
import numpy as np 
import sys
import dateutil.easter as easter
from pycaret.regression import *

In [None]:
# Set TEST to True to hold out the 2018 rows of train as a local "test" set.
TEST = False

train = pd.read_csv(
    '../input/tabular-playground-series-jan-2022/train.csv',
    index_col='row_id',
)
test = pd.read_csv(
    '../input/tabular-playground-series-jan-2022/test.csv',
    index_col='row_id',
)

# Parse the dates once so the year can be used for the summary and the split.
train['date'] = pd.to_datetime(train['date'])
train['year'] = train['date'].dt.year

# Average sales per year.
print(train.groupby('year')[['num_sold']].mean())

# kgroups is reused later as the CV fold count and, as kgroups + 1, as the
# positional row of the GDP table used as the reference year.
kgroups = 2 if TEST else 3
if TEST:
    # Replace the real test set with the 2018 rows and remove them from train.
    test = train.loc[train['year'] == 2018].copy()
    train.drop(index=test.index, inplace=True)

# Feature Engineering
### GDP Deflator

In [None]:
# GDP table: one row per year (index), one column per country.
gdp = pd.read_csv(
    '../input/gdp-20152019-finland-norway-and-sweden/GDP_data_2015_to_2019_Finland_Norway_Sweden.csv',
    index_col=0,
)

# Strip the first four characters of every column name so the remaining text
# can be matched against the `country` column of the sales data.
gdp = gdp.rename(columns=lambda name: name[4:])

# Exponent 1.21: see explanation in
# https://www.kaggle.com/ambrosm/tpsjan22-03-linear-model/notebook
gdp = gdp ** 1.21

# Ratio of the reference row (position kgroups + 1) to every (year, country) cell.
reference_row = gdp.iloc[kgroups + 1]
scaler = reference_row / gdp

# Flatten to a {(year, country): factor} lookup.
stacked_factors = scaler.stack()
gdp_map = stacked_factors.to_dict()

In [None]:
# Rescale sales with the (year, country) factor from gdp_map, then model the
# logarithm of the rescaled value.
#
# The key Series is built on train's own index. Without `index=train.index`
# it would get a fresh 0..n-1 index, and the multiplication below (which
# aligns on index labels) would only be correct while row_id happens to be
# exactly 0..n-1 -- e.g. it would misalign if rows were dropped or reordered.
gdp_keys = pd.Series(
    list(zip(train.date.dt.year, train.country)),
    index=train.index,
)
train['num_sold'] = gdp_keys.map(gdp_map) * train.num_sold
train['num_sold'] = np.log(train.num_sold)

### Date processing

In [None]:
def _per_year_dates(years, date_for_year):
    """Map every year in `years` to the date returned by `date_for_year(year)`.

    The date is computed once per distinct year rather than once per row.
    """
    lookup = {int(y): pd.Timestamp(date_for_year(int(y)))
              for y in years.dropna().unique()}
    return years.map(lookup)


def _last_wednesday_of_june(year):
    """Return the Timestamp of the last Wednesday in June of `year`."""
    june_30 = pd.Timestamp(year=year, month=6, day=30)
    # weekday(): Monday=0, Wednesday=2 -> step back to the closest Wednesday.
    return june_30 - pd.Timedelta(days=(june_30.weekday() - 2) % 7)


def _first_sunday_of_november(year):
    """Return the Timestamp of the first Sunday in November of `year`."""
    nov_1 = pd.Timestamp(year=year, month=11, day=1)
    # weekday(): Sunday=6 -> step forward to the closest Sunday.
    return nov_1 + pd.Timedelta(days=(6 - nov_1.weekday()) % 7)


def date_process(df):
    """Add calendar and holiday features to `df` in place and drop `date`.

    `df` must have a `date` column (anything `pd.to_datetime` accepts) and a
    `country` column of strings. The frame is modified in place and nothing
    is returned.

    Columns added:
      year               calendar year
      wd56, wd4          "True"/"False" flag for weekend (Sat/Sun) and Friday,
                         concatenated with the country name
      dayofyear          day of year; in leap years days from Feb 29 onwards
                         are shifted back by one
      xmas_adjust1/2     days since Dec 25, clipped to [1, 6] / [-2, 20]
      newyear_adjust1/2  dayofyear clipped to [0, 15] / [0, 2]
      easter_adj         days since Easter Sunday, clipped to [-3, 60], with
                         values 12..38 collapsed to 12
      days_from_wed_jun  days since the last Wednesday of June, clipped to [-5, 5]
      days_from_sun_nov  days since the first Sunday of November, clipped to [-1, 9]

    The June and November anchor dates are derived from the calendar, so they
    are defined for any year (they were previously a fixed 2015-2019 table,
    which produced NaN for every other year).
    """
    df['date'] = pd.to_datetime(df['date'])
    df['year'] = df['date'].dt.year
    df['wd56'] = (df.date.dt.weekday >= 5).astype(str) + df.country
    df['wd4'] = (df.date.dt.weekday == 4).astype(str) + df.country
    df['dayofyear'] = df['date'].dt.dayofyear
    # Day 60 of a leap year is Feb 29: shift it and everything after it back
    # by one so later dates get the same day number in leap and non-leap years.
    df.loc[(df.date.dt.is_leap_year) & (df.dayofyear >= 60), 'dayofyear'] -= 1

    years = df['year']

    # Christmas
    xmas_date = _per_year_dates(
        years, lambda y: pd.Timestamp(year=y, month=12, day=25))
    days_from_xmas = (df.date - xmas_date).dt.days
    df['xmas_adjust1'] = days_from_xmas.clip(lower=1, upper=6)
    df['xmas_adjust2'] = days_from_xmas.clip(lower=-2, upper=20) * 1.0

    # New Year
    df['newyear_adjust1'] = df.dayofyear.clip(lower=0, upper=15)
    df['newyear_adjust2'] = df.dayofyear.clip(lower=0, upper=2)

    # Easter
    easter_date = _per_year_dates(years, easter.easter)
    df['easter_adj'] = (
        (df.date - easter_date).dt.days.clip(lower=-3, upper=60).astype(float)
    )
    df.loc[df['easter_adj'].isin(range(12, 39)), 'easter_adj'] = 12

    # Last Wednesday of June
    wed_june_date = _per_year_dates(years, _last_wednesday_of_june)
    df['days_from_wed_jun'] = (df.date - wed_june_date).dt.days.clip(-5, 5)

    # First Sunday of November (second Sunday is Father's Day)
    sun_nov_date = _per_year_dates(years, _first_sunday_of_november)
    df['days_from_sun_nov'] = (df.date - sun_nov_date).dt.days.clip(-1, 9)

    df.drop(columns=['date'], inplace=True)
    

In [None]:
# Add the calendar/holiday features to both frames (each is modified in place).
for frame in (train, test):
    date_process(frame)

In [None]:
# Print the column names, dtypes and non-null counts of the processed training frame.
train.info()

# Modelling
### Fit CatBoost model

Many other models are available and can be compared with the `compare_models` function.

In [None]:
def SMAPE(y_true, y_pred):
    """Return the symmetric mean absolute percentage error, in percent (0-200).

    Computed element-wise as 200 * |y_true - y_pred| / (|y_true| + |y_pred|)
    and averaged over all elements.

    Parameters
    ----------
    y_true, y_pred : array-like of numbers
        Actual and predicted values; shapes must be broadcast-compatible.
        Values are compared positionally.

    Notes
    -----
    * The denominator uses |y_true| (not raw y_true), so negative actual
      values no longer produce negative or infinite terms. Results for
      strictly positive inputs are unchanged.
    * A pair where both values are 0 is a perfect prediction and contributes
      0 instead of NaN (0/0).
    * The metric is computed on whatever scale it is given: if the model is
      trained on a transformed target (e.g. a logarithm), the value reported
      is SMAPE of the transformed values.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    abs_error = np.abs(y_true - y_pred)
    scale = np.abs(y_true) + np.abs(y_pred)

    # Divide only where the denominator is non-zero; the pre-filled zeros are
    # the result for the 0/0 case.
    ratio = np.divide(abs_error, scale,
                      out=np.zeros_like(abs_error), where=scale != 0)
    return (ratio * 200).mean()


# Train NUMBER_OF_MODELS CatBoost regressors that differ only in their seed;
# they are collected in `models`.
NUMBER_OF_MODELS = 5
models = []

# Options passed unchanged to every setup() call.
setup_options = dict(
    data=train,
    target='num_sold',
    data_split_shuffle=False,
    create_clusters=False,
    fold_strategy='groupkfold',   # cross-validation folds are grouped ...
    fold_groups='year',           # ... by calendar year
    use_gpu=True,
    silent=True,
    fold=kgroups,
    ignore_features=['country'],
    n_jobs=-1,
)

# One seed per model: 123, 690, 1257, ...
model_seeds = [123 + step * 567 for step in range(NUMBER_OF_MODELS)]

for i, model_seed in enumerate(model_seeds):
    print('Fit Model', i)
    # setup() is re-run for every model, so the custom metric has to be
    # registered again each time before the seed is overridden.
    reg = setup(**setup_options)
    add_metric('SMAPE', 'SMAPE', SMAPE, greater_is_better=False)
    set_config('seed', model_seed)
    fitted_model = create_model('catboost')
    models.append(fitted_model)

### Interpret the model

In [None]:

# 'feature_all' is PyCaret's feature-importance plot covering all features;
# only the first of the fitted models is shown.
plot_model(models[0],'feature_all')

In [None]:
# Model interpretation for the first fitted model, using interpret_model's
# default plot type (per PyCaret docs this is a SHAP summary plot -- confirm
# for the installed version).
interpret_model(models[0])

### Blend and finalize

In [None]:
# Combine the separately seeded CatBoost models into a single blended estimator.
blend = blend_models(models)

In [None]:
# Finalize the blend; per PyCaret docs finalize_model refits the estimator on
# the complete dataset, including the hold-out part.
final_blend = finalize_model(blend)

In [None]:
# 'error' is PyCaret's prediction-error plot for the finalized blend.
plot_model(final_blend,'error')

# Submission

In [None]:
# Predict on the test frame; the model was trained on log(num_sold), so the
# 'Label' column is exponentiated to get back to sales units.
predictions = predict_model(final_blend, data=test)
test['pred'] = np.exp(predictions['Label'])

# Two-column submission file: row_id, num_sold.
sub = pd.DataFrame({
    'row_id': test.index.to_numpy(),
    'num_sold': test['pred'].to_numpy(),
})
sub.to_csv('submission.csv', index=False)

In [None]:
# Show the first rows of the submission as a quick sanity check.
sub.head()

In [None]:
# Column means of the submission; the num_sold mean can be compared with the
# per-year training means printed when the data was loaded.
sub.mean()