In [None]:
import numpy as np 
import pandas as pd 
pd.options.display.max_columns = 20
import os
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor
from colorama import Fore, Back, Style
import seaborn as sns
import plotly.express as px
import matplotlib
from matplotlib.patches import Patch
from matplotlib import pyplot as plt
plt.rcParams.update({'figure.max_open_warning': 0})
plt.style.use('fivethirtyeight')
cmap_data = plt.cm.Paired
cmap_cv = plt.cm.coolwarm
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the competition's train and test tables from the Kaggle input directory.
train = pd.read_csv('../input/tabular-playground-series-jan-2022/train.csv')
test = pd.read_csv('../input/tabular-playground-series-jan-2022/test.csv')

# Cell output: the distinct countries and stores present in the training data.
countries = np.unique(train['country'])
stores = np.unique(train['store'])
countries, stores

<h2> Transform the data into time series indexed by unique dates

In [None]:
# Build one daily sales series per (product, country, store) combination.
# Column names follow the pattern '<product> <country> <store>', for example:
#   'Mug Fin Mart' : Kaggle Mug sold in Finland by KaggleMart
#   'Hat Fin Mart' : Kaggle Hat sold in Finland by KaggleMart
#   'Sti Fin Mart' : Kaggle Sticker sold in Finland by KaggleMart
COUNTRY_LABELS = (('Finland', 'Fin'), ('Norway', 'Nor'), ('Sweden', 'Swe'))
STORE_LABELS = (('KaggleMart', 'Mart'), ('KaggleRama', 'Rama'))
PRODUCT_LABELS = (('Kaggle Mug', 'Mug'), ('Kaggle Hat', 'Hat'), ('Kaggle Sticker', 'Sti'))

target = pd.DataFrame()

# The nesting order (country -> store -> product) fixes the column order.
# The submission code flattens predictions column by column, so do not reorder.
for country, country_label in COUNTRY_LABELS:
    for store, store_label in STORE_LABELS:
        for product, product_label in PRODUCT_LABELS:
            rows = (
                (train['product'] == product)
                & (train['country'] == country)
                & (train['store'] == store)
            )
            # Select 'num_sold' before summing so only the target column is aggregated.
            column = f'{product_label} {country_label} {store_label}'
            target[column] = train.loc[rows].groupby('date')['num_sold'].sum()

# Replace the date index by plain 0..N-1 positions.
target.index = list(range(target.shape[0]))
target.head(3)

<h2> Some lists to visualize our data

In [None]:
def _columns_with(*tokens):
    """Return the names of the `target` columns that contain every given token."""
    return [name for name in target.columns if all(token in name for token in tokens)]


# KaggleRama series: all countries, then one list per country.
Rama = _columns_with('Rama')
Rama_swe = _columns_with('Rama', 'Swe')
Rama_fin = _columns_with('Rama', 'Fin')
Rama_nor = _columns_with('Rama', 'Nor')

# KaggleMart series: all countries, then one list per country.
Mart = _columns_with('Mart')
Mart_swe = _columns_with('Mart', 'Swe')
Mart_fin = _columns_with('Mart', 'Fin')
Mart_nor = _columns_with('Mart', 'Nor')

def show_me(data) :
    """Draw every column of `data` as a daily line chart starting on 2015-01-01.

    Parameters
    ----------
    data : pandas.DataFrame
        One column per series and one row per consecutive day. The frame is
        not modified; a re-indexed copy is plotted.

    Returns
    -------
    None
        The chart is drawn on a new 20x10 inch figure.
    """
    # Set the theme before creating the axes so the first figure is styled
    # like the later ones.
    sns.set_theme(style="whitegrid")
    fig, ax = plt.subplots(figsize=(20, 10))
    # One calendar day per row; for the 1461-row training frames this is
    # exactly 2015-01-01 .. 2018-12-31.
    dates = pd.date_range(start='2015-01-01', periods=len(data), freq="D")
    plot_data = data.copy()
    plot_data.index = dates
    sns.lineplot(data=plot_data, palette="tab10", linewidth=1, ax=ax)

<h3> Sales by KaggleRama

In [None]:
show_me(target[Rama]) # Sales for KaggleRama in all countries

In [None]:
show_me(target[Rama_fin]) # Sales for KaggleRama in Finland

<h3> Sales by KaggleMart

In [None]:
show_me(target[Mart]) # Sales for KaggleMart across all countries

In [None]:
show_me(target[Mart_fin]) # Sales for KaggleMart in Finland

We can see that every product, in every country and store, shows seasonality and a slight trend.

<h2> Data Engineering

In [None]:
def _date_features(dates):
    """Build calendar features, one row per unique date found in `dates`.

    Parameters
    ----------
    dates : array-like of date strings
        May contain repeated dates; they are de-duplicated and sorted first.

    Returns
    -------
    pandas.DataFrame
        Columns: year, month, day, dayofweek, dayofmonth, dayofyear, weekday.
        The date itself is not kept.
    """
    unique_dates = pd.Series(pd.to_datetime(np.unique(dates)))
    dt = unique_dates.dt
    return pd.DataFrame({
        'year': dt.year,
        'month': dt.month,
        'day': dt.day,
        'dayofweek': dt.dayofweek,
        # Despite its name, this holds the number of days in the month
        # (days_in_month); kept so the model sees the same feature set.
        'dayofmonth': dt.days_in_month,
        'dayofyear': dt.dayofyear,
        # Same values as 'dayofweek'; kept so the model sees the same feature set.
        'weekday': dt.weekday,
    })


train_data = _date_features(train['date'])
test_data = _date_features(test['date'])
train_data.shape,test_data.shape

In [None]:
# Peek at the first rows of the engineered date features.
train_data.head(3)

<h2> Seasonality

It seems clear that the seasonality is annual, but let's confirm it.
We are going to transform our data to obtain a stationary series.

In [None]:
# Series used for the stationarity checks: daily 'Mug Fin Mart' sales.
y = pd.DataFrame()
y['num_sold'] = target['Mug Fin Mart']
# target carries a positional 0..N-1 index. Passing those integers to
# pd.to_datetime would read them as nanoseconds after 1970-01-01, so rebuild
# the calendar index from the sorted unique training dates instead.
y.index = pd.to_datetime(np.unique(train['date']))

In [None]:
from statsmodels.tsa.stattools import adfuller
# Augmented Dickey-Fuller test on the raw series. Only the 'num_sold' column is
# passed, so the cell still works after more columns have been added to y.
dftest = adfuller(y['num_sold'], autolag = 'AIC')
print("1. ADF : ",dftest[0])
print("2. P-Value : ", dftest[1])

The p-value is well above 0.05, so we cannot reject the unit-root hypothesis: the series is not stationary. Let's transform it.

In [None]:
# Smooth with a 12-row rolling mean, then take first differences.
# NOTE(review): the data has one row per day, so window=12 spans 12 days, not
# the 12 months of a yearly cycle — confirm the intended window.
fig_dims = (20,7)
# Work on the 'num_sold' column only so re-running the cell gives the same result.
rolling_mean = y['num_sold'].rolling(window = 12).mean()
y['rolling_mean_diff'] = rolling_mean - rolling_mean.shift()

# One panel per series, each drawn on its own explicit axes.
fig, (ax_diff, ax_orig) = plt.subplots(2, 1, figsize=fig_dims)
y['rolling_mean_diff'].plot(ax=ax_diff, title='after rolling mean & differencing')
y['num_sold'].plot(ax=ax_orig, title='original')
fig.tight_layout();

In [None]:
# ADF test on the differenced rolling mean; the NaN rows produced by the
# rolling window and the shift are dropped first.
differenced = y['rolling_mean_diff'].dropna()
dftest = adfuller(differenced, autolag = 'AIC')
adf_statistic, p_value = dftest[0], dftest[1]
print("1. ADF : ",adf_statistic)
print("2. P-Value : ", p_value)

We can see that the p-value is now below 0.05,
so the transformed time series is stationary.

<h2> Let's split our dataset with seasonality for the training by year

We train on one year and validate on the following year.

<h3> Split with training: 2015 and validation: 2016

In [None]:
# Example series for the split illustrations: 'Mug Fin Mart' as a one-column
# frame named 'num_sold', keeping target's positional index.
y = target[['Mug Fin Mart']].rename(columns={'Mug Fin Mart': 'num_sold'})

In [None]:
# Green = training year, blue = validation year.
# Rows are selected by calendar year rather than in 365-row chunks: 2016 is a
# leap year, so fixed 365-row chunks drift off the year boundaries.
train_year, valid_year = 2015, 2016
# Assumes train_data and y hold one row per date in the same order.
row_year = train_data['year'].to_numpy()

fig, ax = plt.subplots(figsize=(10,3))
y.loc[row_year == train_year, 'num_sold'].plot(ax=ax, linewidth=1, color='green')
y.loc[row_year == valid_year, 'num_sold'].plot(ax=ax, linewidth=1, color='blue');

<h3> Split with training: 2016 and validation: 2017

In [None]:
# Green = training year, blue = validation year.
# Rows are selected by calendar year rather than in 365-row chunks: 2016 is a
# leap year, so fixed 365-row chunks drift off the year boundaries.
train_year, valid_year = 2016, 2017
# Assumes train_data and y hold one row per date in the same order.
row_year = train_data['year'].to_numpy()

fig, ax = plt.subplots(figsize=(10,3))
y.loc[row_year == train_year, 'num_sold'].plot(ax=ax, linewidth=1, color='green')
y.loc[row_year == valid_year, 'num_sold'].plot(ax=ax, linewidth=1, color='blue');

<h3> Split with training: 2017 and validation: 2018

In [None]:
# Green = training year, blue = validation year.
# Rows are selected by calendar year rather than in 365-row chunks: 2016 is a
# leap year, so fixed 365-row chunks drift off the year boundaries.
train_year, valid_year = 2017, 2018
# Assumes train_data and y hold one row per date in the same order.
row_year = train_data['year'].to_numpy()

fig, ax = plt.subplots(figsize=(10,3))
y.loc[row_year == train_year, 'num_sold'].plot(ax=ax, linewidth=1, color='green')
y.loc[row_year == valid_year, 'num_sold'].plot(ax=ax, linewidth=1, color='blue');

<h2> Let's split our dataset by cumulative years

In [None]:
# Green = training span (every year up to last_train_year), blue = validation year.
# Rows are selected by calendar year rather than in 365-row chunks: 2016 is a
# leap year, so fixed 365-row chunks drift off the year boundaries.
last_train_year, valid_year = 2015, 2016
# Assumes train_data and y hold one row per date in the same order.
row_year = train_data['year'].to_numpy()

fig, ax = plt.subplots(figsize=(10,3))
y.loc[row_year <= last_train_year, 'num_sold'].plot(ax=ax, linewidth=1, color='green')
y.loc[row_year == valid_year, 'num_sold'].plot(ax=ax, linewidth=1, color='blue');

In [None]:
# Green = training span (every year up to last_train_year), blue = validation year.
# Rows are selected by calendar year rather than in 365-row chunks: 2016 is a
# leap year, so fixed 365-row chunks drift off the year boundaries.
last_train_year, valid_year = 2016, 2017
# Assumes train_data and y hold one row per date in the same order.
row_year = train_data['year'].to_numpy()

fig, ax = plt.subplots(figsize=(10,3))
y.loc[row_year <= last_train_year, 'num_sold'].plot(ax=ax, linewidth=1, color='green')
y.loc[row_year == valid_year, 'num_sold'].plot(ax=ax, linewidth=1, color='blue');

In [None]:
# Green = training span (every year up to last_train_year), blue = validation year.
# Rows are selected by calendar year rather than in 365-row chunks: 2016 is a
# leap year, so fixed 365-row chunks drift off the year boundaries.
last_train_year, valid_year = 2017, 2018
# Assumes train_data and y hold one row per date in the same order.
row_year = train_data['year'].to_numpy()

fig, ax = plt.subplots(figsize=(10,3))
y.loc[row_year <= last_train_year, 'num_sold'].plot(ax=ax, linewidth=1, color='green')
y.loc[row_year == valid_year, 'num_sold'].plot(ax=ax, linewidth=1, color='blue');

<h2> Let's do a Training by year

In [None]:
# Walk-forward validation by single year: fit on one calendar year, validate on
# the next, and average the test predictions of all folds.
score_pred =[]

# Hyper-parameters are identical for every fold, so they are defined once.
param1 = {
    'learning_rate': 0.004280047845210125,
    'depth': 5,
    'l2_leaf_reg': 0.0010555278350981901,
    'loss_function': 'MultiRMSE',
    'eval_metric': 'MultiRMSE',
    'task_type': 'CPU',
    'iterations': 16962
    }

# Folds follow calendar years instead of 365-row chunks, so the leap year does
# not shift the boundaries. Assumes train_data and target hold one row per date
# in the same order.
row_year = train_data['year'].to_numpy()
years = np.unique(row_year)                 # sorted distinct years
folds = list(zip(years[:-1], years[1:]))    # (training year, validation year)
n_series = target.shape[1]

# Out-of-fold predictions cover every row from the first validation year on.
oof_rows = row_year >= years[1]
train_pred = np.zeros((int(oof_rows.sum()), n_series))
test_pred_by_year = np.zeros((test_data.shape[0], n_series))

for i, (train_year, valid_year) in enumerate(folds):

    train_rows = row_year == train_year
    valid_rows = row_year == valid_year

    X_train, y_train = train_data.loc[train_rows], target.loc[train_rows]
    X_valid, y_valid = train_data.loc[valid_rows], target.loc[valid_rows]

    clf = CatBoostRegressor(**param1)
    clf.fit(
                X_train, y_train,
                eval_set=[(X_valid,y_valid)],
                early_stopping_rounds = 1000,
                verbose=0)
    pred=clf.predict(X_valid)
    # valid_rows[oof_rows] marks this fold's rows inside the out-of-fold array.
    train_pred[valid_rows[oof_rows]]=pred

    score = np.round(mean_squared_error(y_valid,pred))
    score_pred.append(score)
    print("fold",i+1,"score MSE =",score,"RMSE =",np.round(np.sqrt(score)))

    pred_test = clf.predict(test_data)
    test_pred_by_year += pred_test/len(folds)

# Score the out-of-fold predictions against exactly the rows they were made for.
score_total_mse = np.round(mean_squared_error(target.loc[oof_rows],train_pred))

print(70*'*')
print('Score oof MSE   =',score_total_mse)
print('Score oof RMSE  =',np.round(np.sqrt(score_total_mse)))
print('Score mean MSE  =',np.round(np.mean(score_pred)))
print('Score mean RMSE =',np.round(np.sqrt(np.mean(score_pred))))
print(70*'*')

<h2> Let's do a Training by cumulative years

In [None]:
# Walk-forward validation with an expanding window: fit on every year up to and
# including the training year, validate on the next year, and average the test
# predictions of all folds.
score_pred =[]

# Hyper-parameters are identical for every fold, so they are defined once.
param1 = {
    'learning_rate': 0.004280047845210125,
    'depth': 5,
    'l2_leaf_reg': 0.0010555278350981901,
    'loss_function': 'MultiRMSE',
    'eval_metric': 'MultiRMSE',
    'task_type': 'CPU',
    'iterations': 16962
    }

# Folds follow calendar years instead of 365-row chunks, so the leap year does
# not shift the boundaries. Assumes train_data and target hold one row per date
# in the same order.
row_year = train_data['year'].to_numpy()
years = np.unique(row_year)                 # sorted distinct years
folds = list(zip(years[:-1], years[1:]))    # (last training year, validation year)
n_series = target.shape[1]

# Out-of-fold predictions cover every row from the first validation year on.
oof_rows = row_year >= years[1]
train_pred = np.zeros((int(oof_rows.sum()), n_series))
test_pred_cumulative = np.zeros((test_data.shape[0], n_series))

for i, (last_train_year, valid_year) in enumerate(folds):

    train_rows = row_year <= last_train_year
    valid_rows = row_year == valid_year

    X_train, y_train = train_data.loc[train_rows], target.loc[train_rows]
    X_valid, y_valid = train_data.loc[valid_rows], target.loc[valid_rows]

    clf = CatBoostRegressor(**param1)
    clf.fit(
                X_train, y_train,
                eval_set=[(X_valid,y_valid)],
                early_stopping_rounds = 1000,
                verbose=0)
    pred=clf.predict(X_valid)
    # valid_rows[oof_rows] marks this fold's rows inside the out-of-fold array.
    train_pred[valid_rows[oof_rows]]=pred

    score = np.round(mean_squared_error(y_valid,pred))
    score_pred.append(score)
    print("fold",i+1,"score MSE =",score,"RMSE =",np.round(np.sqrt(score)))

    pred_test = clf.predict(test_data)
    test_pred_cumulative += pred_test/len(folds)

# Score the out-of-fold predictions against exactly the rows they were made for.
score_total_mse = np.round(mean_squared_error(target.loc[oof_rows],train_pred))

print(70*'*')
print('Score oof MSE   =',score_total_mse)
print('Score oof RMSE  =',np.round(np.sqrt(score_total_mse)))
print('Score mean MSE  =',np.round(np.mean(score_pred)))
print('Score mean RMSE =',np.round(np.sqrt(np.mean(score_pred))))
print(70*'*')

We can see that training on cumulative years seems slightly better than training on a single year.

<h2> Let's prepare the submission files format

In [None]:
# Wrap both averaged test-prediction arrays in frames that reuse target's
# column names, then show the first rows of each.
series_names = target.columns.tolist()
pred_test_df_by_year = pd.DataFrame(test_pred_by_year, columns=series_names)
pred_test_df_cumulative = pd.DataFrame(test_pred_cumulative, columns=series_names)
for frame in (pred_test_df_by_year, pred_test_df_cumulative):
    display(frame.head(3))

In [None]:
# Sample submission: supplies the row_id column and the number of rows that
# make_submission has to fill.
sub = pd.read_csv('../input/tabular-playground-series-jan-2022/sample_submission.csv')

def make_submission(df, sample=None):
    """Flatten a (days x series) prediction frame into the submission layout.

    Predictions are laid out row by row: every column of the first row of
    `df`, then every column of the second row, and so on. This presumably
    matches the row order of the sample submission — verify against test.csv.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per test date and one column per series.
    sample : pandas.DataFrame, optional
        Frame providing the 'row_id' column and the index of the result.
        Defaults to the notebook-level `sub` sample submission.

    Returns
    -------
    pandas.DataFrame
        Columns 'row_id' and 'num_sold', indexed like `sample`.

    Raises
    ------
    ValueError
        If `df` does not hold exactly one value per row of `sample`.
    """
    if sample is None:
        sample = sub
    # Row-major flattening reproduces the nested (row, column) loop order in a
    # single vectorized step, without chained per-cell assignment.
    values = df.to_numpy().ravel()
    if values.size != sample.shape[0]:
        raise ValueError(
            f"expected {sample.shape[0]} predictions, got {values.size}"
        )
    submission = pd.DataFrame(
        {'row_id': sample['row_id'].to_numpy(), 'num_sold': values},
        index=sample.index.tolist(),
    )
    return submission

# Turn both prediction frames into submission-shaped frames and preview them.
submission_by_year = make_submission(pred_test_df_by_year)
submission_cumulative = make_submission(pred_test_df_cumulative)
display(submission_by_year.head(3), submission_cumulative.head(3))

In [None]:
# Blend: plain average of the by-year and cumulative predictions, written into
# a copy of the sample submission.
blended_num_sold = (submission_by_year['num_sold'] + submission_cumulative['num_sold']) / 2
submission_mean = sub.assign(num_sold=blended_num_sold)
submission_mean.head()

In [None]:
# Second blend: average our mean submission with an external public submission.
public_submission = pd.read_csv('../input/tps-2022-01/public_submission.csv')
bonus_num_sold = (submission_mean['num_sold'] + public_submission['num_sold']) / 2
submission_bonus = sub.assign(num_sold=bonus_num_sold)
submission_bonus.head()

In [None]:
# Write every submission variant to its own CSV file, without the index column.
for file_name, frame in (
    ('submission_by_year.csv', submission_by_year),
    ('submission_cumulative.csv', submission_cumulative),
    ('submission_mean.csv', submission_mean),
    ('submission_bonus.csv', submission_bonus),
):
    frame.to_csv(file_name, index=False)