#### Title
In this exercise I show one possible way of using CatBoost for time series forecasting.<br>
This is a very basic model, so please don't expect a high score; you can use it as a baseline for further experiments.

In [None]:
import os
import datetime
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from catboost import CatBoostRegressor

# Notebook-wide display settings
plt.rcParams.update({"figure.figsize": (20, 10)})   # default figure size for every plot
pd.set_option("display.max_rows", None)             # do not truncate rows when displaying frames
pd.set_option("mode.chained_assignment", None)      # Disable pandas chained-assignment warnings

In [None]:
# Read the competition train / test tables from the Kaggle input directory
train_raw, test_raw = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-jan-2022/train.csv',
        '../input/tabular-playground-series-jan-2022/test.csv',
    )
)
print(train_raw.shape, test_raw.shape)

There is no SMAPE metric in sklearn, so I wrote my own.<br>
Since SMAPE is a symmetric metric, you can pass the predictions and the actual values (the A and B arguments) in any order.

In [None]:
def SMAPE (A, B):
    """Symmetric mean absolute percentage error, in percent.

    Parameters
    ----------
    A, B : array-like of numbers with the same length
        Predictions and actual values. The metric is symmetric, so the
        order of the two arguments does not matter.

    Returns
    -------
    float
        100 / len(A) * sum(2 * |B - A| / (|A| + |B|)).
        Pairs where both values are 0 contribute 0 to the sum.
    """
    a = np.asarray(A, dtype=float)
    b = np.asarray(B, dtype=float)

    numerator = 2 * np.abs(b - a)
    denominator = np.abs(a) + np.abs(b)

    #When both values are 0 the plain formula yields 0/0 = NaN, which would
    #turn the whole score into NaN. Such a pair is a perfect prediction, so
    #it is counted as zero error instead.
    ratio = np.divide(numerator, denominator,
                      out=np.zeros_like(numerator),
                      where=denominator != 0)

    return 100 / len(a) * np.sum(ratio)

Here I add some basic additional features

In [None]:
def add_date_features(df):
    """Return a copy of `df` with calendar features derived from its 'date' column.

    The 'date' column is converted to pandas datetime and the following
    columns are added, in this order: 'day', 'day_year', 'month',
    'week_day', 'week', 'weekend'.

    Using one function for both train and test guarantees that every
    feature is computed from the frame's own dates (previously the test
    'day_year' was accidentally computed from the train dates).
    """
    out = df.copy()

    #Convert date column to pandas datetime
    out['date'] = pd.to_datetime(out['date'])

    #Day of month
    out['day'] = out['date'].dt.day

    #Day of year
    out['day_year'] = out['date'].dt.dayofyear

    #Month
    out['month'] = out['date'].dt.month

    #Day of week (0-6 for Mon-Sun)
    out['week_day'] = out['date'].dt.dayofweek

    #Week of year
    out['week'] = out['date'].dt.isocalendar().week.astype(int)

    #Weekend (0 if not, 1 if yes)
    out['weekend'] = (out['week_day'] >= 5).astype(int)

    return out


train_tmp = add_date_features(train_raw)
test_tmp = add_date_features(test_raw)

#### Segments encoding
We have 3 countries, 2 stores and 3 products. This means we actually have 3 x 2 x 3 = 18 segments and have to perform every operation 18 times, once for each segment.<br>
Here I create and encode 18 segments (0-17) from the ```country```, ```store``` and ```product``` features.

In [None]:
#One string key per row: the concatenation of country, store and product
train_key = train_tmp['country'] + train_tmp['store'] + train_tmp['product']
test_key = test_tmp['country'] + test_tmp['store'] + test_tmp['product']

#Fit the key -> code mapping on the training keys and reuse it for the test
#keys. Encoding the two frames independently only yields matching codes when
#both contain exactly the same set of keys.
seg_categories = train_key.astype('category').cat.categories
train_tmp['seg'] = pd.Categorical(train_key, categories=seg_categories).codes
test_tmp['seg'] = pd.Categorical(test_key, categories=seg_categories).codes

#A code of -1 marks a test key that never occurs in train (no model could be fitted for it)
assert (test_tmp['seg'] >= 0).all(), "test set contains a segment that is absent from the training set"

seg_count = train_tmp['seg'].nunique()

Here is the plot for all 18 segments

In [None]:
#Overlay the sales series of every segment on the same axes
for seg_id in range(seg_count):
    train_tmp.loc[train_tmp['seg'] == seg_id, 'num_sold'].plot()

#### Additional features
Fortunately, the test set directly continues the training set in time, which lets us use a 365-day lag. Here I create it for each of the 18 segments.

In [None]:
#Function to create lag for one segment
def lag_365 (df, seg):
    """Fill 'Lag_365' for the rows of one segment and return the same frame.

    For the rows where df['seg'] == seg, 'Lag_365' is set to that segment's
    'num_sold' shifted down by 365 rows. `df` is modified in place.
    """
    in_segment = df['seg'] == seg
    shifted_sales = df.loc[in_segment, 'num_sold'].shift(365)
    df.loc[in_segment, 'Lag_365'] = shifted_sales
    return df

#Stack train on top of test so that test rows can take their lag from train rows
n_train_rows = len(train_tmp)
merged = pd.concat([train_tmp, test_tmp])

#Add the lag one segment at a time (18 passes)
for seg_id in range(seg_count):
    merged = lag_365(merged, seg_id)

#Undo the stacking: the first n_train_rows rows are train, the rest is test
train_tmp = merged.iloc[:n_train_rows]
test_tmp = merged.iloc[n_train_rows:].drop('num_sold', axis=1)

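Drop features that are no longer needed: `country`, `store` and `product` are already encoded in `seg`, and `row_id` is just an identifier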

In [None]:
#Columns removed from both tables before modelling
drop_cols = ['country', 'store', 'product', 'row_id']
train_tmp = train_tmp.drop(columns=drop_cols)
test_tmp = test_tmp.drop(columns=drop_cols)

Split a validation set from the training set. I will use 2018 for validation and all earlier data for training.

In [None]:
#Rows dated before the cutoff are used for fitting, rows on/after it are held out
split_date = datetime.datetime(2018, 1, 1)
before_split = train_tmp['date'] < split_date
from_split_on = train_tmp['date'] >= split_date

train = train_tmp[before_split]
valid = train_tmp[from_split_on]

#The target tables keep 'seg' so they can be filtered per segment later
target_cols = ['seg', 'num_sold']
X_train, y_train = train.drop(columns='num_sold'), train[target_cols]
X_valid, y_valid = valid.drop(columns='num_sold'), valid[target_cols]

print(X_train.shape, y_train.shape, X_valid.shape, y_valid.shape)
print(train.shape, valid.shape)

#### Out of fold validation
As we have 18 segments, we need a function that trains and validates a model for each segment

In [None]:
def oof_validation (train_X, train_y, valid_X, valid_y, seg):
    """Train a CatBoost model on one segment and score it on the validation rows.

    Parameters
    ----------
    train_X, valid_X : DataFrame
        Feature tables; must contain a 'seg' column used to select the segment.
    train_y, valid_y : DataFrame
        Target tables with the columns 'seg' and 'num_sold'.
    seg : int
        Code of the segment to train and validate on.

    Returns
    -------
    float
        SMAPE (in percent) of the model's predictions on the validation rows
        of the segment. The score is also printed.
    """
    #Select the rows of the requested segment once and reuse them below
    seg_train_X = train_X[train_X['seg'] == seg]
    seg_train_y = train_y.loc[train_y['seg'] == seg, 'num_sold']
    seg_valid_X = valid_X[valid_X['seg'] == seg]
    seg_valid_y = valid_y.loc[valid_y['seg'] == seg, 'num_sold']

    #This is a very simple model, almost all defaults
    mod = CatBoostRegressor(random_seed = 17,      #Random seed
                            thread_count = 4,      #CPU cores at Kaggle notebook
                            verbose = 0,           #Silent mode
                            eval_metric = 'SMAPE', #Competition metric
                            has_time = True)       #Turn off shuffle

    #When an eval_set is passed CatBoost defaults to use_best_model=True, i.e.
    #it keeps the iteration that scores best on the validation rows. Scoring
    #on those same rows would then be optimistic, and the model would differ
    #from the one trained in `predictions` (which has no eval_set). Disabling
    #it keeps the validated model and the final model configured identically.
    mod.fit(
            seg_train_X,
            seg_train_y,
            eval_set = (seg_valid_X, seg_valid_y),
            use_best_model = False,
            )
    pred = mod.predict(seg_valid_X)

    #Here I use my function to calculate SMAPE
    smape = SMAPE(pred, seg_valid_y)
    print('OOF SMAPE for segment', seg, 'is:', "%.5f" % smape)

    #The function returns SMAPE for the requested segment
    return smape

Here I call the OOF function 18 times and collect SMAPE for each run

In [None]:
#One validation score per segment, in segment order
score = [
    oof_validation(X_train, y_train, X_valid, y_valid, seg_id)
    for seg_id in range(seg_count)
]
print('---')
print('Mean OOF SMAPE is:', "%.5f" % (sum(score) / seg_count))

### Predictions
I will train the model on a full training set

In [None]:
X_train_full = train_tmp.drop('num_sold', axis=1)
y_train_full = train_tmp[['seg','num_sold']]
#Work on a copy: the prediction step writes a 'num_sold' column into this
#frame in place, and without the copy that write would also modify test_tmp,
#so re-running this cell would silently pick up old predictions as a feature.
X_test_full = test_tmp.copy()
print(X_train_full.shape, y_train_full.shape, X_test_full.shape)

A very similar function to generate predictions

In [None]:
def predictions (train_X, train_y, test_X, seg):
    """Train a CatBoost model on one segment and write its test predictions.

    Parameters
    ----------
    train_X : DataFrame
        Training features; must contain a 'seg' column.
    train_y : DataFrame
        Training targets with the columns 'seg' and 'num_sold'.
    test_X : DataFrame
        Test features; must contain every column of `train_X`. It is modified
        in place: the predictions for the segment are written into its
        'num_sold' column (created on the first call).
    seg : int
        Code of the segment to train and predict.

    Returns
    -------
    DataFrame
        The same `test_X` object that was passed in.
    """
    train_rows = train_X['seg'] == seg
    test_rows = test_X['seg'] == seg

    mod = CatBoostRegressor(random_seed = 17,      #Random seed
                            thread_count = 4,      #CPU cores at Kaggle notebook
                            verbose = 0,           #Silent mode
                            eval_metric = 'SMAPE', #Competition metric
                            has_time = True)       #Turn off shuffle

    mod.fit(train_X[train_rows], train_y.loc[train_y['seg'] == seg, 'num_sold'])

    #Predict from the training columns only: once the first segment has been
    #processed test_X also carries the 'num_sold' output column, which is not
    #a feature and must not be handed to the model.
    pred = mod.predict(test_X.loc[test_rows, train_X.columns])
    test_X.loc[test_rows, 'num_sold'] = pred
    print('Predictions for segment', seg, 'complete')
    return test_X

Call the function 18 times to generate predictions for every segment

In [None]:
#Fill 'num_sold' in the test table one segment at a time
for seg_id in range(seg_count):
    seg_result = predictions(X_train_full, y_train_full, X_test_full, seg_id)
    in_segment = X_test_full['seg'] == seg_id
    X_test_full.loc[in_segment, 'num_sold'] = seg_result['num_sold']
print('---')
print('Predictions complete')
X_test_full.head()

Test plot (predictions vs. previous year) to check that the predictions look realistic

In [None]:
segment = 5 #choose any

#Validation-period sales (black) and test-period predictions of the chosen
#segment, both re-indexed from 0 so the two curves overlap on the x axis
prev_year = y_valid.loc[y_valid['seg'] == segment, 'num_sold'].reset_index(drop = True)
forecast = X_test_full.loc[X_test_full['seg'] == segment, 'num_sold'].reset_index(drop = True)

ax = prev_year.plot(color = 'black')
forecast.plot(ax=ax)

#### Submission

In [None]:
#Take the sample submission as a template and replace its 'num_sold' column
#with the predictions (matched by index label)
sub = (
    pd.read_csv('../input/tabular-playground-series-jan-2022/sample_submission.csv')
    .assign(num_sold=X_test_full['num_sold'])
)
sub.to_csv('submission.csv', index=False)
sub.head()