# Problem statement

Goal:
Build a model for forecasting the number of sales in stores based on historical data.
Target feature - "num_sold"

Metric:
Submissions are evaluated on SMAPE between forecasts and actual values.

# Contents

* [1. Importing libraries and loading datasets](#1.0)
* [2. Primary analysis](#2.0)
* [3. Pre-processing](#3.0)
* [4. Time series analysis](#4.0)
* [5. Feature Engineering](#5.0)
* [6. Predicting](#6.0)

<a id = '1.0'></a>
# 1. Importing libraries and loading datasets

In [None]:
# install the World Bank API library (wbgapi)
# if you get an error, try enabling the "Internet" setting in your notebook

!pip install wbgapi

In [None]:
# importing libraries:

import wbgapi as wb

import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from catboost import CatBoostRegressor

In [None]:
# disabling warnings:

pd.options.mode.chained_assignment = None

In [None]:
# loading datasets:

train_orig = pd.read_csv('/kaggle/input/tabular-playground-series-jan-2022/train.csv')
test_orig = pd.read_csv('/kaggle/input/tabular-playground-series-jan-2022/test.csv')

In [None]:
# date range in datasets:

# Use the vectorised pandas reducers rather than the Python builtins: they
# avoid iterating the column element by element and skip missing values
# instead of failing on a str/NaN comparison.
min_date = train_orig['date'].min()
max_date = test_orig['date'].max()

print(f'min date in datasets:{min_date}\nmax date in datasets:{max_date}')

In [None]:
# extra data from worldbank API:
# I couldn't figure out how to get the data link from the API, so I opened the website and copied the link from the URL.
# example link "NY.GDP.MKTP.CD"

# Every indicator is requested for the same economies and years,
# so those two arguments are stated once.
wb_economies = ['FIN', 'SWE', 'NOR']
wb_years = range(2015, 2020)

GDP_wb_df = wb.data.DataFrame('NY.GDP.MKTP.CD', wb_economies, wb_years)
population_wb_df = wb.data.DataFrame('SP.POP.TOTL', wb_economies, wb_years)
perc_urban_population_wb_df = wb.data.DataFrame('SP.URB.TOTL.IN.ZS', wb_economies, wb_years)
population_density_wb_df = wb.data.DataFrame('EN.POP.DNST', wb_economies, wb_years)

In [None]:
# Show every World Bank table under its title, with a blank line between tables.
wb_tables = [
    ('GDP 2015 - 2019 subdataset', GDP_wb_df),
    ('Population 2015 - 2019 subdataset', population_wb_df),
    ('Urban population (% of total population) 2015 - 2019 subdataset', perc_urban_population_wb_df),
    ('Population density (people per sq. km of land area) 2015 - 2019 subdataset', population_density_wb_df),
]

for position, (title, table) in enumerate(wb_tables):
    if position:
        print()
    print(title)
    display(table)

<a id = '2.0'></a>
# 2. Primary analysis

In [None]:
print(f'train_orig df size - {train_orig.shape}')
print(f'test_orig df size - {test_orig.shape}')

In [None]:
# checking for missing values:

def na_values(data):
    """Summarise the columns of ``data`` that contain missing values.

    Returns a DataFrame indexed by column name with two columns:
    'missing_values' (count of NaN entries) and '% of total' (that count
    divided by the number of rows, rounded to two decimals). Columns without
    missing values are left out; rows are ordered by descending count.
    """
    counts = data.isna().sum()
    counts = counts[counts != 0]
    summary = pd.DataFrame({'missing_values': counts})
    summary['% of total'] = (counts / len(data)).round(2)
    return summary.sort_values(by='missing_values', ascending=False)

In [None]:
display(na_values(train_orig))
display(na_values(test_orig))

In [None]:
train_orig.info()

In [None]:
train_orig.head()

<a id = '3.0'></a>
# 3. Pre-processing

In [None]:
# Column roles and settings shared by the rest of the notebook.
cat_features_list = ['country', 'store', 'product']  # categorical columns, ordinal-encoded before modelling
target_feature = 'num_sold'  # column to predict
data_feature = 'date'  # name of the date column (converted to the index later)
row_index = 'row_id'  # identifier column, dropped from the feature matrices
validation_size = 0.2  # passed as test_size to train_test_split

# Distinct values of each categorical column in the training data.
countrys_list = train_orig['country'].unique()
stores_list = train_orig['store'].unique()
products_list = train_orig['product'].unique()

In [None]:
# all combinations:

# Cartesian product of country x store x product, each combination as a list.
features_combinations = list(map(list, itertools.product(countrys_list, stores_list, products_list)))

In [None]:
# save train to dictionary:

full_data_dict = {}

for country, store, product in features_combinations:
    selection = (
        (train_orig['country'] == country)
        & (train_orig['store'] == store)
        & (train_orig['product'] == product)
    )
    # .copy() gives every combination its own frame, so later column
    # assignments never write into (or warn about) a slice of train_orig.
    full_data_dict[' & '.join((country, store, product))] = train_orig.loc[selection].copy()

In [None]:
# cast to datetime64 type

for key in full_data_dict:
    # Replace the whole column instead of using `.loc[:, col] = ...`:
    # recent pandas versions write `.loc[:, col]` values into the existing
    # object-dtype column, which may leave the dtype unchanged.
    full_data_dict[key][data_feature] = pd.to_datetime(
        full_data_dict[key][data_feature], format='%Y-%m-%d')

In [None]:
# convert the "date" feature to an index.

# Replace each stored frame by a version indexed on the date column.
for series_name, frame in full_data_dict.items():
    full_data_dict[series_name] = frame.set_index(data_feature)

In [None]:
# sort the df in ascending index order

# Replace each stored frame by a version ordered by ascending index.
for series_name, frame in full_data_dict.items():
    full_data_dict[series_name] = frame.sort_index()

In [None]:
# creating a subdataset and drop unnecessary features for analysis:

# Per-combination frames without the identifier / categorical columns;
# full_data_dict itself is left untouched.
drop_columns = ['row_id','country','store','product']

data_dict = {
    series_name: frame.drop(drop_columns, axis=1)
    for series_name, frame in full_data_dict.items()
}

<a id = '4.0'></a>
# 4. Time series analysis

I originally wanted to build a SARIMA model, but it turned out to be too computationally heavy. I later found a great notebook on this topic by a Kaggle grandmaster, but it was too late to use it: https://www.kaggle.com/kailex/tabular-playground-22ts

In [None]:
# check that every time series is ordered by date (monotonically increasing index).
# note: this does not detect gaps (missing dates) in the series.

for key in data_dict:
    # `Index.is_monotonic` was removed in pandas 2.0; `is_monotonic_increasing`
    # is the equivalent attribute and also exists in older versions.
    print(f'Time series "{key}" is monotonic: {data_dict[key].index.is_monotonic_increasing}')

In [None]:
# checking the time series for stationarity
# use Dickey-Fuller test https://www.statsmodels.org/dev/generated/statsmodels.tsa.stattools.adfuller.html
# H0 - The time series is not stationary
# H1 - The time series is stationary

alpha = 0.05
adfuller_dict = {'stationary': [],'stationary_p-value': [],'non-stationary': [],'non-stationary_p-value': []}

for key in data_dict:
    values_seria = data_dict[key].values
    p_value = adfuller(values_seria)[1]
    # Reject H0 (non-stationary) when the p-value is below alpha.
    verdict = 'stationary' if p_value < alpha else 'non-stationary'
    adfuller_dict[verdict].append(key)
    adfuller_dict[f'{verdict}_p-value'].append(f'{p_value:.3f}')

# The stationary and non-stationary groups generally differ in size, and a
# DataFrame built directly from unequal-length lists raises a ValueError.
# Wrapping each list in a Series lets pandas pad the shorter columns with NaN.
adfuller_report = pd.DataFrame(
    {column: pd.Series(entries, dtype='object') for column, entries in adfuller_dict.items()}
)
display(adfuller_report)

In [None]:
# if you need to visually analyze trends and seasonality:

plot_graphs = False

In [None]:
#Trend analysis

if plot_graphs:
    # One chart per series showing the trend component of its decomposition.
    for series_name, series_frame in data_dict.items():
        trend_component = seasonal_decompose(series_frame).trend
        trend_component.plot(figsize=(12,7))
        plt.xlabel('Date', fontsize=15)
        plt.ylabel('Number of sales ', fontsize=15)
        plt.title(f'Trend {series_name}', fontsize=15)
        plt.grid()
        plt.show()


In [None]:
# Seasonal analysis

if plot_graphs:
    for key in data_dict:
        decomposed = seasonal_decompose(data_dict[key])
        dec_seasonal = decomposed.seasonal
        # Plot a single month (March 2018) of the seasonal component.
        dec_seasonal['2018-03-01':'2018-03-31'].plot(figsize=(12,7))
        plt.xlabel('Date', fontsize=15)
        plt.ylabel('Number of sales ',fontsize=15)
        # This chart shows the seasonal component, so it must not be titled "Trend".
        plt.title(f'Seasonality {key}', fontsize=15)
        plt.grid()
        plt.show()

Sales peaks were observed on certain days and weeks.
The series show weekly seasonality, so day-of-week and weekend features are needed.

<a id = '5.0'></a>
# 5. Feature Engineering

In [None]:
# train and validation split

# shuffle=False keeps the original row order: the leading rows become the
# training set and the trailing `validation_size` share becomes validation.
# NOTE(review): this is a chronological hold-out only if train_orig is already
# ordered by date — confirm.
train, valid = train_test_split(train_orig, shuffle=False, test_size = validation_size)
# Work on a copy so test_orig stays unchanged.
test = test_orig.copy()

In [None]:
# cast to datetime64 type
# convert the "date" feature to an index.

def date_conversion_to_index(data,data_feature):
    """Return a copy of ``data`` indexed by its parsed date column.

    Parameters
    ----------
    data : DataFrame containing the column ``data_feature`` with dates
        formatted as '%Y-%m-%d'.
    data_feature : name of the date column.

    Returns
    -------
    A new DataFrame whose index is the parsed ``data_feature`` column
    (the column is removed from the columns); ``data`` is not modified.
    """
    data_copy = data.copy()
    # Replace the whole column instead of using `.loc[:, col] = ...`: recent
    # pandas versions write `.loc[:, col]` values into the existing
    # object-dtype column, which may leave the index without datetime dtype.
    data_copy[data_feature] = pd.to_datetime(data_copy[data_feature], format='%Y-%m-%d')
    return data_copy.set_index(data_feature)

In [None]:
# Index all three datasets by their parsed date column.
train, valid, test = [
    date_conversion_to_index(frame, data_feature) for frame in (train, valid, test)
]

In [None]:
def make_date_features(data):
    """Return a copy of ``data`` extended with calendar features of its index.

    ``data`` must be indexed by a DatetimeIndex. The added columns are
    'year', 'quarter', 'month', 'week' (ISO week number), 'day_of_year',
    'day_of_week' (0 = Monday) and 'weekend' (1 for Saturday/Sunday, else 0).
    """
    dates = data.index
    enriched = data.copy()
    calendar_parts = {
        'year': dates.year,
        'quarter': dates.quarter,
        'month': dates.month,
        'week': dates.isocalendar().week,
        'day_of_year': dates.dayofyear,
        'day_of_week': dates.dayofweek,
    }
    for column_name, column_values in calendar_parts.items():
        enriched[column_name] = column_values
    # Saturday is 5 and Sunday is 6; multiplying turns the boolean flag into 0/1.
    enriched['weekend'] = enriched['day_of_week'].isin([5, 6]).mul(1)
    return enriched

In [None]:
# Add the calendar features to all three datasets.
train, valid, test = [make_date_features(frame) for frame in (train, valid, test)]

In [None]:
# using extra data from www.worldbank.org/

def make_features_from_wbdf(data,
                            data_group_column,
                            wbdf,
                            match_dict,
                            left_on_feature,
                            right_on_feature):
    """Look up, for every row of ``data``, the World Bank value of its own group and year.

    Parameters
    ----------
    data : DataFrame containing ``data_group_column`` and ``left_on_feature``.
    data_group_column : column of ``data`` naming the group of each row
        (its values are compared with the translated economy names).
    wbdf : wide table with one row per economy code and one column per year
        label; non-digit characters are stripped from the labels
        (e.g. 'YR2015' -> 2015). It is not modified.
    match_dict : maps economy codes of ``wbdf`` to the names used in
        ``data_group_column``; codes missing from the mapping are kept as is.
    left_on_feature : column of ``data`` holding the year to match.
    right_on_feature : name given to the year column of the reshaped table
        for the merge.

    Returns
    -------
    Series named 'new_feature_name', aligned with ``data.index``. Rows whose
    group or year has no World Bank value are NaN.
    """
    # Rows become years and columns become economies; copy so wbdf stays untouched.
    wb_df = wbdf.T.copy()

    # dictionary substitution
    wb_df_columns = [match_dict.get(code, code) for code in wb_df.columns]
    wb_df.columns = wb_df_columns

    # Keep only the digits of each year label and expose the years as a column
    # named after right_on_feature, so the merge key is not hard-coded.
    wb_df.index = (wb_df.index.astype(str)
                   .str.replace(r"[^\d]", "", regex=True)
                   .astype('int'))
    wb_df = wb_df.rename_axis(right_on_feature).reset_index()

    # merge
    merge_data = data.merge(wb_df,
                            how = 'left',
                            left_on = left_on_feature,
                            right_on = right_on_feature)
    # merge() returns a fresh positional index; restore the caller's index.
    merge_data = merge_data.set_index(data.index)

    # getting a single feature: blank every economy column that does not belong
    # to the row's own group. A single .loc call is used because chained
    # assignment may write to a temporary copy. The mask is a plain array
    # because the index can contain duplicates.
    group_values = merge_data[data_group_column].to_numpy()
    for column in wb_df_columns:
        merge_data.loc[group_values != column, column] = np.nan

    # min_count=1 keeps rows without any matching value as NaN instead of 0.
    merge_data['new_feature_name'] = merge_data[wb_df_columns].sum(axis=1, min_count=1)

    return merge_data['new_feature_name']
    

In [None]:
# dictionary for matching country names

match_dict = {'FIN' : 'Finland', 
              'SWE' : 'Sweden', 
              'NOR': 'Norway'}

In [None]:
# New column name -> World Bank table it is taken from (insertion order is kept).
train_wb_sources = {
    'population': population_wb_df,
    'gdp': GDP_wb_df,
    'perc_urban_population': perc_urban_population_wb_df,
    'population_density': population_density_wb_df,
}

for feature_name, wb_table in train_wb_sources.items():
    train[feature_name] = make_features_from_wbdf(
        train, 'country', wb_table, match_dict, 'year', 'index')

In [None]:
# New column name -> World Bank table it is taken from (insertion order is kept).
valid_wb_sources = {
    'population': population_wb_df,
    'gdp': GDP_wb_df,
    'perc_urban_population': perc_urban_population_wb_df,
    'population_density': population_density_wb_df,
}

for feature_name, wb_table in valid_wb_sources.items():
    valid[feature_name] = make_features_from_wbdf(
        valid, 'country', wb_table, match_dict, 'year', 'index')

In [None]:
# New column name -> World Bank table it is taken from (insertion order is kept).
test_wb_sources = {
    'population': population_wb_df,
    'gdp': GDP_wb_df,
    'perc_urban_population': perc_urban_population_wb_df,
    'population_density': population_density_wb_df,
}

for feature_name, wb_table in test_wb_sources.items():
    test[feature_name] = make_features_from_wbdf(
        test, 'country', wb_table, match_dict, 'year', 'index')

In [None]:
# make features subsets

# Columns that are never model inputs on labelled data: the row id and the target.
non_feature_columns = [row_index, target_feature]

X_train = train.drop(columns=non_feature_columns)
y_train = train[target_feature]

X_valid = valid.drop(columns=non_feature_columns)
y_valid = valid[target_feature]

# The test frame is only stripped of the row id here.
X_test = test.drop(columns=[row_index])

In [None]:
# encoding categorical features

# Fit the encoder once on the training data and reuse it for the other splits,
# so a category always maps to the same code. Fitting a separate encoder per
# split only agrees when every split contains exactly the same category set.
cat_encoder = OrdinalEncoder()
X_train[cat_features_list] = cat_encoder.fit_transform(X_train[cat_features_list])
X_valid[cat_features_list] = cat_encoder.transform(X_valid[cat_features_list])
X_test[cat_features_list] = cat_encoder.transform(X_test[cat_features_list])

In [None]:
# scaling

scaler = StandardScaler()

def scaling_data(data,numerical_features,fit=True):
    """Standardise ``numerical_features`` of ``data`` in place with the shared ``scaler``.

    Parameters
    ----------
    data : DataFrame whose ``numerical_features`` columns are overwritten.
    numerical_features : list of column names to scale.
    fit : when True (the default, matching the previous behaviour) the shared
        scaler is re-fitted on ``data`` before transforming. Pass False for
        validation/test data to reuse the statistics of the last fit, so all
        splits are scaled consistently.
    """
    if fit:
        scaler.fit(data[numerical_features])
    data[numerical_features] = scaler.transform(data[numerical_features])

In [None]:
# in this case catboost works better without scaling

#numerical_features = ['population', 'gdp']
#scaling_data(X_train,numerical_features)
#scaling_data(X_valid,numerical_features)
#scaling_data(X_test,numerical_features)

In [None]:
# Cast the 'week' column of every feature matrix to float64.
for feature_frame in (X_train, X_valid, X_test):
    feature_frame['week'] = feature_frame['week'].astype('float64')

In [None]:
# simple CatBoostRegressor model

# CatBoost regressor evaluated with SMAPE, the metric used to score submissions.
# NOTE(review): use_best_model presumably relies on the eval_set passed to
# fit() in the next cell — confirm against the CatBoost documentation.
model = CatBoostRegressor(eval_metric='SMAPE',
                          use_best_model=True,
                          random_seed=123,  # fixed seed
                          verbose = 200)

In [None]:
model.fit(X_train,y_train,eval_set=[(X_valid, y_valid)])

In [None]:
def plot_feature_importances(model,features_train):
    """Draw a bar chart of the ten largest feature importances of ``model``.

    Parameters
    ----------
    model : fitted estimator exposing ``feature_importances_``.
    features_train : iterable yielding the feature names in the same order
        as the importances (iterating a DataFrame yields its column names).
    """
    top_importances = (
        pd.Series(model.feature_importances_, index=list(features_train))
        .sort_values(ascending=False)
        .head(10)
    )
    top_importances.plot.bar(figsize=(10,5))
    plt.xlabel('Features', fontsize=15)
    plt.ylabel('Importances', fontsize=15)
    plt.title('Top-10 features', fontsize=15)
    plt.grid()
    plt.show()

In [None]:
best_model = model

In [None]:
best_model.get_best_score()

In [None]:
# model parameters

best_model.get_all_params()

In [None]:
# feature importances

plot_feature_importances(best_model,X_valid)

<a id = '6.0'></a>
# 6. Predicting

In [None]:
# Predict the test set with the selected model and pair every prediction
# with the row_id of the original test file.
model = best_model
predictions_array = model.predict(X_test)
submission = pd.DataFrame({
    'row_id': test_orig['row_id'].to_numpy(),
    'num_sold': predictions_array,
})

In [None]:
submission.to_csv('./submission.csv', index = False)