# Libraries

In [None]:
import numpy as np
import pandas as pd

# Notebook-wide pandas display settings: never truncate long cell text and
# allow up to 999 rows to be rendered before pandas collapses the output.
pd.options.display.max_colwidth = None
pd.options.display.max_rows = 999

import matplotlib.pyplot as plt
import seaborn as sns

import itertools


In [None]:
# Install PyStan at a pinned version, then Prophet, and import the forecaster
# class that every modelling cell below instantiates.
# NOTE(review): `prophet` itself is installed without a version pin, so the
# resolved release may differ between runs — consider pinning it as well.
!pip install pystan==2.19.1.1
!pip install prophet

from prophet import Prophet

# Load the data

In [None]:
# Read the training set and convert the `date` column to datetime so it can be
# compared against date strings and plotted on a time axis later on.
train = pd.read_csv('../input/tabular-playground-series-jan-2022/train.csv')
train['date'] = pd.to_datetime(train['date'])

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" to the output.
train.info()
train.head()

In [None]:
# Read the test set and convert the `date` column to datetime, mirroring the
# preparation applied to the training set.
test = pd.read_csv('../input/tabular-playground-series-jan-2022/test.csv')
test['date'] = pd.to_datetime(test['date'])

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" to the output.
test.info()
test.head()

# Competition Metric
https://www.kaggle.com/cpmpml/smape-weirdness

In [None]:
def SMAPE(y_true, y_pred):
    """Symmetric mean absolute percentage error, on a 0-200 scale.

    Each element contributes ``|y_true - y_pred| / ((|y_true| + |y_pred|) / 2) * 100``.
    Elements where both the actual and the predicted value are zero contribute
    0 instead of an undefined 0/0.

    Parameters
    ----------
    y_true : array-like
        Actual values.
    y_pred : array-like
        Predicted values, broadcastable against ``y_true``.

    Returns
    -------
    float
        Mean of the element-wise scores.
    """
    # Accept lists / Series as well as ndarrays and work in floating point.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    abs_error = np.abs(y_true - y_pred)
    # Use magnitudes on both sides so negative actuals cannot cancel the
    # prediction and produce a zero or negative denominator.
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 200.0

    # Divide only where the denominator is non-zero; the remaining entries keep
    # the 0.0 they were initialised with, and no divide-by-zero warning is raised.
    diff = np.divide(abs_error, denominator,
                     out=np.zeros_like(abs_error),
                     where=denominator != 0)
    return np.mean(diff)

# Summary

In [None]:
# Grid of line charts of num_sold over time: one row per country, one column
# per store, and one coloured line per product.
sns.relplot(
    data=train,
    kind='line',
    x='date',
    y='num_sold',
    hue='product',
    row='country',
    col='store',
    height=2.5,
    aspect=3,
)

# Train Test Val Split

In [None]:
# Hold out everything from 2018-01-01 onwards as the validation set and keep
# the earlier rows for training. Columns are renamed to the `ds` / `y` names
# that the Prophet cells below rely on.
prophet_names = {'date': 'ds', 'num_sold': 'y'}

# `val` must be carved out first: `train` is overwritten on the next statement.
val = (
    train[train['date'] >= '2018-01-01']
    .reset_index(drop=True)
    .rename(columns=prophet_names)
)
train = (
    train[train['date'] < '2018-01-01']
    .reset_index(drop=True)
    .rename(columns=prophet_names)
)
test = test.rename(columns={'date': 'ds'})

# Report size and date coverage of each split.
for label, frame in (('Train', train), ('Val', val), ('Test', test)):
    print(label, frame.shape, '| Start', frame.ds.min(), '| End', frame.ds.max())

# Base Prophet

In [None]:
# Baseline: fit one default Prophet model per (country, store, product) series
# on the training split and score it with SMAPE on both splits.
# NOTE(review): forecasts are written back positionally via `.values`, which
# presumes Prophet returns rows in the same order as the subset passed in
# (i.e. each subset is already in chronological order) — confirm.
series_keys = itertools.product(train.country.unique(),
                                train.store.unique(),
                                train['product'].unique())

for country, store, product in series_keys:
    print(country, store, product)

    # Row labels of the current series in each split
    train_idx = train.index[(train.country == country)
                            & (train.store == store)
                            & (train['product'] == product)]
    val_idx = val.index[(val.country == country)
                        & (val.store == store)
                        & (val['product'] == product)]

    train_sub = train.loc[train_idx].copy()
    val_sub = val.loc[val_idx].copy()

    # Fit on the training rows of this series only
    model = Prophet()
    model.fit(train_sub)

    # Forecast the training and the validation period
    train_preds = model.predict(train_sub)
    val_preds = model.predict(val_sub)

    # Per-series scores based on the competition metric (SMAPE)
    train_score = SMAPE(train_sub.y.values, train_preds.yhat.values)
    val_score = SMAPE(val_sub.y.values, val_preds.yhat.values)

    print()
    print('--------------------------------------------------------------------------')
    for label, score in (('Train Score', train_score), ('Val Score', val_score)):
        print(label, country, store, product, 'SMAPE: {:f}'.format(score))
    print('--------------------------------------------------------------------------')
    print()

    # Store the forecasts next to the actuals for the overall score below
    train.loc[train_idx, 'yhat'] = train_preds.yhat.values
    val.loc[val_idx, 'yhat'] = val_preds.yhat.values

# Overall scores across all series
print()
print('--------------------------------------------------------------------------')
for label, frame in (('Train Score', train), ('Val Score', val)):
    print(label, 'SMAPE: {:f}'.format(SMAPE(frame.y.values, frame.yhat.values)))
print('--------------------------------------------------------------------------')
print()

## Add Holidays
https://www.kaggle.com/gunesevitan/tabular-playground-series-jan-2022-prophet

In [None]:
# Holiday table in the layout passed to Prophet(holidays=...): one row per
# occurrence, with the number of days before (`lower_window`) and after
# (`upper_window`) the date that are also treated as part of the holiday.

# New Year's Day 2015-2019, extended one day backwards (New Year's Eve).
new_year = pd.DataFrame({
    'holiday': 'new_year',
    'ds': pd.to_datetime(['{}-01-01'.format(year) for year in range(2015, 2020)]),
    'lower_window': -1,
    'upper_window': 0,
})

# Easter Sunday 2015-2019, extended over the seven following days.
easter_sundays = ['2015-04-05', '2016-03-27', '2017-04-16', '2018-04-01', '2019-04-21']
easter = pd.DataFrame({
    'holiday': 'easter',
    'ds': pd.to_datetime(easter_sundays),
    'lower_window': 0,
    'upper_window': 7,
})

holidays = pd.concat([new_year, easter])
holidays

In [None]:
# Same per-series experiment as the baseline, but each Prophet model is given
# the `holidays` table so New Year and Easter effects can be fitted.
# NOTE(review): forecasts are written back positionally via `.values`, which
# presumes Prophet returns rows in the same order as the subset passed in
# (i.e. each subset is already in chronological order) — confirm.
series_keys = itertools.product(train.country.unique(),
                                train.store.unique(),
                                train['product'].unique())

for country, store, product in series_keys:
    print(country, store, product)

    # Row labels of the current series in each split
    train_idx = train.index[(train.country == country)
                            & (train.store == store)
                            & (train['product'] == product)]
    val_idx = val.index[(val.country == country)
                        & (val.store == store)
                        & (val['product'] == product)]

    train_sub = train.loc[train_idx].copy()
    val_sub = val.loc[val_idx].copy()

    # Fit on the training rows of this series only
    model = Prophet(holidays=holidays)
    model.fit(train_sub)

    # Forecast the training and the validation period
    train_preds = model.predict(train_sub)
    val_preds = model.predict(val_sub)

    # Per-series scores based on the competition metric (SMAPE)
    train_score = SMAPE(train_sub.y.values, train_preds.yhat.values)
    val_score = SMAPE(val_sub.y.values, val_preds.yhat.values)

    print()
    print('--------------------------------------------------------------------------')
    for label, score in (('Train Score', train_score), ('Val Score', val_score)):
        print(label, country, store, product, 'SMAPE: {:f}'.format(score))
    print('--------------------------------------------------------------------------')
    print()

    # Store the forecasts next to the actuals for the overall score below
    train.loc[train_idx, 'yhat'] = train_preds.yhat.values
    val.loc[val_idx, 'yhat'] = val_preds.yhat.values

# Overall scores across all series
print()
print('--------------------------------------------------------------------------')
for label, frame in (('Train Score', train), ('Val Score', val)):
    print(label, 'SMAPE: {:f}'.format(SMAPE(frame.y.values, frame.yhat.values)))
print('--------------------------------------------------------------------------')
print()

## Tuned Parameters
https://www.kaggle.com/gunesevitan/tabular-playground-series-jan-2022-prophet

In [None]:
# Per-series experiment with the holiday table plus a hand-tuned Prophet
# configuration; the scoring procedure is unchanged.
# NOTE(review): forecasts are written back positionally via `.values`, which
# presumes Prophet returns rows in the same order as the subset passed in
# (i.e. each subset is already in chronological order) — confirm.
tuned_params = dict(
    growth='linear',
    holidays=holidays,
    n_changepoints=10,
    changepoint_range=0.4,
    yearly_seasonality=True,
    weekly_seasonality=True,
    daily_seasonality=False,
    seasonality_mode='additive',
    seasonality_prior_scale=25,
    holidays_prior_scale=100,
    changepoint_prior_scale=0.01,
    interval_width=0.5,
    uncertainty_samples=False,
)

series_keys = itertools.product(train.country.unique(),
                                train.store.unique(),
                                train['product'].unique())

for country, store, product in series_keys:
    print(country, store, product)

    # Row labels of the current series in each split
    train_idx = train.index[(train.country == country)
                            & (train.store == store)
                            & (train['product'] == product)]
    val_idx = val.index[(val.country == country)
                        & (val.store == store)
                        & (val['product'] == product)]

    train_sub = train.loc[train_idx].copy()
    val_sub = val.loc[val_idx].copy()

    # Fit on the training rows of this series only
    model = Prophet(**tuned_params)
    model.fit(train_sub)

    # Forecast the training and the validation period
    train_preds = model.predict(train_sub)
    val_preds = model.predict(val_sub)

    # Per-series scores based on the competition metric (SMAPE)
    train_score = SMAPE(train_sub.y.values, train_preds.yhat.values)
    val_score = SMAPE(val_sub.y.values, val_preds.yhat.values)

    print()
    print('--------------------------------------------------------------------------')
    for label, score in (('Train Score', train_score), ('Val Score', val_score)):
        print(label, country, store, product, 'SMAPE: {:f}'.format(score))
    print('--------------------------------------------------------------------------')
    print()

    # Store the forecasts next to the actuals for the overall score below
    train.loc[train_idx, 'yhat'] = train_preds.yhat.values
    val.loc[val_idx, 'yhat'] = val_preds.yhat.values

# Overall scores across all series
print()
print('--------------------------------------------------------------------------')
for label, frame in (('Train Score', train), ('Val Score', val)):
    print(label, 'SMAPE: {:f}'.format(SMAPE(frame.y.values, frame.yhat.values)))
print('--------------------------------------------------------------------------')
print()

# Rounding

In [None]:
# Tuned Prophet configuration with forecasts rounded to the nearest integer,
# to measure what rounding does to SMAPE.
#
# The rounded forecasts are used for BOTH the per-series scores and the stored
# `yhat` column. Previously the per-series scores were computed on the raw
# forecasts while the overall score used the rounded ones, so the per-series
# report never reflected the effect of rounding.
# NOTE(review): forecasts are written back positionally via `.values`, which
# presumes Prophet returns rows in the same order as the subset passed in
# (i.e. each subset is already in chronological order) — confirm.
for country in train.country.unique():
    for store in train.store.unique():
        for product in train['product'].unique():
            print(country, store, product)

            # Subsets for the current country, store and product
            train_idx = train[(train.country==country) &
                              (train.store==store) &
                              (train['product']==product)].index

            train_sub = train.loc[train_idx].copy()

            val_idx = val[(val.country==country) &
                          (val.store==store) &
                          (val['product']==product)].index

            val_sub = val.loc[val_idx].copy()

            # Define the model and fit it on the train subset of data
            model = Prophet(
                growth='linear',
                holidays=holidays,
                n_changepoints=10,
                changepoint_range=0.4,
                yearly_seasonality=True,
                weekly_seasonality=True,
                daily_seasonality=False,
                seasonality_mode='additive',
                seasonality_prior_scale=25,
                holidays_prior_scale=100,
                changepoint_prior_scale=0.01,
                interval_width=0.5,
                uncertainty_samples=False
            )
            model.fit(train_sub)

            # Predict for the train and validation datasets
            train_preds = model.predict(train_sub)
            val_preds = model.predict(val_sub)

            # Round once, then reuse the rounded values for scoring and storage
            train_yhat = np.round(train_preds.yhat.values, 0)
            val_yhat = np.round(val_preds.yhat.values, 0)

            # Calculate scores based on the competition metric (SMAPE)
            train_score = SMAPE(train_sub.y.values, train_yhat)
            val_score = SMAPE(val_sub.y.values, val_yhat)

            print()
            print('--------------------------------------------------------------------------')
            print('Train Score', country, store, product, 'SMAPE: {:f}'.format(train_score))
            print('Val Score', country, store, product, 'SMAPE: {:f}'.format(val_score))
            print('--------------------------------------------------------------------------')
            print()

            # Add predictions to the train and validation datasets
            train.loc[train_idx, 'yhat'] = train_yhat
            val.loc[val_idx, 'yhat'] = val_yhat

# Overall scores across all series
print()
print('--------------------------------------------------------------------------')
print('Train Score', 'SMAPE: {:f}'.format(SMAPE(train.y.values, train.yhat.values)))
print('Val Score', 'SMAPE: {:f}'.format(SMAPE(val.y.values, val.yhat.values)))
print('--------------------------------------------------------------------------')
print()

# Ceiling

In [None]:
# Tuned Prophet configuration with forecasts rounded UP to the next integer,
# to measure what taking the ceiling does to SMAPE.
#
# The ceiled forecasts are used for BOTH the per-series scores and the stored
# `yhat` column. Previously the per-series scores were computed on the raw
# forecasts while the overall score used the ceiled ones, so the per-series
# report never reflected the effect of the ceiling.
# NOTE(review): forecasts are written back positionally via `.values`, which
# presumes Prophet returns rows in the same order as the subset passed in
# (i.e. each subset is already in chronological order) — confirm.
for country in train.country.unique():
    for store in train.store.unique():
        for product in train['product'].unique():
            print(country, store, product)

            # Subsets for the current country, store and product
            train_idx = train[(train.country==country) &
                              (train.store==store) &
                              (train['product']==product)].index

            train_sub = train.loc[train_idx].copy()

            val_idx = val[(val.country==country) &
                          (val.store==store) &
                          (val['product']==product)].index

            val_sub = val.loc[val_idx].copy()

            # Define the model and fit it on the train subset of data
            model = Prophet(
                growth='linear',
                holidays=holidays,
                n_changepoints=10,
                changepoint_range=0.4,
                yearly_seasonality=True,
                weekly_seasonality=True,
                daily_seasonality=False,
                seasonality_mode='additive',
                seasonality_prior_scale=25,
                holidays_prior_scale=100,
                changepoint_prior_scale=0.01,
                interval_width=0.5,
                uncertainty_samples=False
            )
            model.fit(train_sub)

            # Predict for the train and validation datasets
            train_preds = model.predict(train_sub)
            val_preds = model.predict(val_sub)

            # Ceil once, then reuse the ceiled values for scoring and storage
            train_yhat = np.ceil(train_preds.yhat.values)
            val_yhat = np.ceil(val_preds.yhat.values)

            # Calculate scores based on the competition metric (SMAPE)
            train_score = SMAPE(train_sub.y.values, train_yhat)
            val_score = SMAPE(val_sub.y.values, val_yhat)

            print()
            print('--------------------------------------------------------------------------')
            print('Train Score', country, store, product, 'SMAPE: {:f}'.format(train_score))
            print('Val Score', country, store, product, 'SMAPE: {:f}'.format(val_score))
            print('--------------------------------------------------------------------------')
            print()

            # Add predictions to the train and validation datasets
            train.loc[train_idx, 'yhat'] = train_yhat
            val.loc[val_idx, 'yhat'] = val_yhat

# Overall scores across all series
print()
print('--------------------------------------------------------------------------')
print('Train Score', 'SMAPE: {:f}'.format(SMAPE(train.y.values, train.yhat.values)))
print('Val Score', 'SMAPE: {:f}'.format(SMAPE(val.y.values, val.yhat.values)))
print('--------------------------------------------------------------------------')
print()

# Final Training

In [None]:
# Final fit: train on the training and validation rows together and forecast
# the test period with the tuned configuration; forecasts are rounded to the
# nearest integer.
#
# `ignore_index=True` already yields a fresh 0..n-1 index, so no extra
# reset_index() is needed. The rounded forecasts are used for both the
# per-series and the overall training score so the two reports agree.
# NOTE(review): forecasts are written back positionally via `.values`, which
# presumes Prophet returns rows in the same order as the subset passed in
# (i.e. each subset is already in chronological order) — confirm.
all_train = pd.concat([train, val], axis=0, ignore_index=True)

for country in all_train.country.unique():
    for store in all_train.store.unique():
        for product in all_train['product'].unique():
            print(country, store, product)

            # Subsets for the current country, store and product
            train_idx = all_train[(all_train.country==country) &
                                  (all_train.store==store) &
                                  (all_train['product']==product)].index

            train_sub = all_train.loc[train_idx].copy()

            test_idx = test[(test.country==country) &
                            (test.store==store) &
                            (test['product']==product)].index

            test_sub = test.loc[test_idx].copy()

            # Define the model and fit it on the full training subset
            model = Prophet(
                growth='linear',
                holidays=holidays,
                n_changepoints=10,
                changepoint_range=0.4,
                yearly_seasonality=True,
                weekly_seasonality=True,
                daily_seasonality=False,
                seasonality_mode='additive',
                seasonality_prior_scale=25,
                holidays_prior_scale=100,
                changepoint_prior_scale=0.01,
                interval_width=0.5,
                uncertainty_samples=False
            )
            model.fit(train_sub)

            # Predict for the train and test datasets
            train_preds = model.predict(train_sub)
            test_preds = model.predict(test_sub)

            # Round once, then reuse the rounded values for scoring and storage
            train_yhat = np.round(train_preds.yhat.values, 0)
            test_yhat = np.round(test_preds.yhat.values, 0)

            # Calculate the training score based on the competition metric
            # (SMAPE); the test set has no targets to score against here.
            train_score = SMAPE(train_sub.y.values, train_yhat)

            print()
            print('--------------------------------------------------------------------------')
            print('Train Score', country, store, product, 'SMAPE: {:f}'.format(train_score))
            print('--------------------------------------------------------------------------')
            print()

            # Add predictions to the train and test datasets
            all_train.loc[train_idx, 'yhat'] = train_yhat
            test.loc[test_idx, 'yhat'] = test_yhat

# Overall training score across all series
print()
print('--------------------------------------------------------------------------')
print('Train Score', 'SMAPE: {:f}'.format(SMAPE(all_train.y.values, all_train.yhat.values)))
print('--------------------------------------------------------------------------')
print()

# Tuning Parameters

In [None]:
# growth='linear',
# holidays=holidays,
# n_changepoints=10,
# changepoint_range=0.4,
# yearly_seasonality=True,
# weekly_seasonality=True,
# daily_seasonality=False,
# seasonality_mode='additive',
# seasonality_prior_scale=25,
# holidays_prior_scale=100,
# changepoint_prior_scale=0.01,
# interval_width=0.5,
# uncertainty_samples=False

In [None]:
# from prophet.diagnostics import cross_validation
# from prophet.diagnostics import performance_metrics

# all_data = pd.concat([train, val], axis=0, ignore_index=True).reset_index(drop=True)

# df = all_data[(all_data.country=='Finland') & 
#               (all_data.store=='KaggleMart') &
#               (all_data['product']=='Kaggle Mug')].copy()

# param_grid = {  
#     'changepoint_prior_scale': [0.001, 0.01, 0.1],
#     'seasonality_prior_scale': [0.01, 0.1, 1, 10, 25],
#     'holidays_prior_scale':[0.01, 0.1, 1, 10],
#     'changepoint_range':[0.7, 0.8, 0.9],
#     'holidays':[holidays]
# }

# cutoffs = pd.to_datetime(['2015-12-31', '2016-12-31', '2017-12-31'])

# # Generate all combinations of parameters
# all_params = [dict(zip(param_grid.keys(), v)) for v in itertools.product(*param_grid.values())]
# print(len(all_params))
# smapes = []  # Store the SMAPEs for each params here

In [None]:
# # Use cross validation to evaluate all parameters
# for params in all_params:
#     m = Prophet(**params).fit(df)  # Fit model with given params
#     df_cv = cross_validation(m, initial=1095, cutoffs=cutoffs, horizon='365 days', parallel="processes")
#     df_p = performance_metrics(df_cv, rolling_window=1)
#     smapes.append(df_p['smape'].values[0])

# # Find the best parameters
# tuning_results = pd.DataFrame(all_params)
# tuning_results['smape'] = smapes

In [None]:
# tuning_results.sort_values('smape', ascending=False).head(50)

# Submission

In [None]:
# Build the submission frame: keep the row identifier, expose the forecast
# under the `num_sold` column name, and round every value up to a whole number.
submission = (
    test[['row_id', 'yhat']]
    .rename(columns={'yhat': 'num_sold'})
    .assign(num_sold=lambda frame: np.ceil(frame['num_sold']))
)
submission.describe()

In [None]:
# Preview the first rows of the submission frame.
submission.head()

In [None]:
# Preview the last rows of the submission frame.
submission.tail()

In [None]:
# Write the submission to `submission.csv` in the working directory, omitting
# the DataFrame index so only the frame's own columns are written.
submission.to_csv('submission.csv', index=False)