In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for root, _subdirs, names in os.walk('/kaggle/input'):
    # Emit one full path per file found beneath the input directory
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. Libs and Datasets

In [None]:
# Loading libs

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Set palette
# Make seaborn's 'colorblind' palette the default for the plots drawn later on

sns.set_palette('colorblind')

In [None]:
# File paths
# All competition files live in the same dataset directory; keep that directory in a
# single constant so it only has to be changed in one place.

DATA_DIR = '/kaggle/input/tabular-playground-series-jan-2022'

sample = f'{DATA_DIR}/sample_submission.csv'
file_train = f'{DATA_DIR}/train.csv'
file_test = f'{DATA_DIR}/test.csv'

In [None]:
# Load files
# Read the three competition CSVs in one pass (sample, train, test - in that order),
# then preview the sample submission.

df_sample, df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in (sample, file_train, file_test)
)

df_sample.head(3)

In [None]:
# Preview the first three rows of the test set
df_test.head(3)

In [None]:
# For each day, 18 observations = 3 countries * 2 stores * 3 products
# Showing 20 rows therefore covers one full day plus the start of the next one
# (assuming the file is ordered by date - TODO confirm).

df_train.head(20)

# 2. Identify predictors and target

In [None]:
# What to predict?
# Name of the target column; reused below when plotting and when building y_all

target = 'num_sold'

In [None]:
# Row and column counts of the training frame
df_train.shape

In [None]:
# 4 years in train set and 1 year in test set
# Sanity check: rows / 18 is the number of days if every day has 18 rows; dividing by
# the expected day count (3 regular years + 1 leap year for train, one regular year
# for test) should print 1.0 when that assumption holds.

print( (df_train.shape[0] / 18) / (365*3 + 366*1) )
print( (df_test.shape[0] / 18) / 365 )

In [None]:
# Column dtypes and non-null counts before the date conversion
df_train.info()

In [None]:
# Convert str to datetime for 'date' column

df_train['date'] = pd.to_datetime(df_train['date'])

In [None]:
# Same summary again, to confirm the new dtype of 'date'
df_train.info()

In [None]:
# Summary statistics, transposed so that each column becomes a row
df_train.describe().T

In [None]:
# Number of missing values per column
df_train.isnull().sum()

In [None]:
# Distinct values of each categorical column, as plain Python lists
countries, stores, products = (
    list(df_train[column].unique()) for column in ('country', 'store', 'product')
)

In [None]:
# Show the category levels, one list per line
for levels in (countries, stores, products):
    print(levels)

# 3. Visualization

In [None]:
# Separate the sales by country, store, and product (with the help of 'hue' in seaborn)

# One row per country and one column per store. The grid size is derived from the data
# instead of being hard-coded, and squeeze=False keeps `ax` two-dimensional even when
# there is only a single country or store, so ax[i, j] is always valid.
fig, ax = plt.subplots(len(countries), len(stores), figsize=(30, 20), squeeze=False)

for i, country in enumerate(countries):
    for j, store in enumerate(stores):
        # Rows belonging to this (country, store) pair; products are split by hue
        panel = df_train[(df_train['country'] == country) & (df_train['store'] == store)]
        sns.lineplot(data=panel, x='date', y=target, hue='product', ax=ax[i, j])
        ax[i, j].set_title(f'{country} - {store}', fontsize=15)
        # The y axis should be identical
        ax[i, j].set_ylim([0, 3000])

plt.show()

# 4. More features

In [None]:
# Create some features for date

# https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#time-date-components

df_train['year'] = df_train.date.dt.year
df_train['quarter'] = df_train.date.dt.quarter
df_train['month'] = df_train.date.dt.month
df_train['day'] = df_train.date.dt.day
df_train['dayofweek'] = df_train.date.dt.dayofweek
df_train['dayofyear'] = df_train.date.dt.dayofyear
# `Series.dt.weekofyear` was deprecated in pandas 1.1 and removed in pandas 2.0.
# `isocalendar().week` is its replacement; it yields a nullable UInt32 column, so cast
# it to int64 to keep the plain integer dtype the old accessor produced.
df_train['weekofyear'] = df_train.date.dt.isocalendar().week.astype('int64')

df_train.head(1)

In [None]:
# Name the engineered date columns explicitly instead of slicing the last seven columns
# of df_train: a positional slice silently picks the wrong columns as soon as the frame
# gains or loses a column, and these names are reused to select the test features.
date_features = ['year', 'quarter', 'month', 'day', 'dayofweek', 'dayofyear', 'weekofyear']

In [None]:
# Create dummy variables
# One indicator frame per categorical column; the first level of each is dropped

dummy_c, dummy_s, dummy_p = (
    pd.get_dummies(df_train[column], prefix=tag, drop_first=True)
    for column, tag in (('country', 'c_'), ('store', 's_'), ('product', 'p_'))
)

In [None]:
# Concatenate data
# Predictors: the date parts followed by the country / store / product indicators

feature_blocks = [df_train[date_features], dummy_c, dummy_s, dummy_p]
X_all = pd.concat(feature_blocks, axis=1)
y_all = df_train[target]

In [None]:
# Validate the data
# Predictors and target must have the same length and the same row labels

assert len(X_all) == len(y_all)
assert X_all.index.tolist() == y_all.index.tolist()

# 5. Models

In [None]:
# Train test ('valid' in this case) split

from sklearn.model_selection import train_test_split

# The rows are dated sales records, so a shuffled split would scatter days from the
# same period across both sets and let the model train on data from "the future" of
# its validation rows. Order the rows by date (stable sort keeps the within-day order)
# and hold out the most recent 25% instead; shuffle=False preserves that order.
chrono = df_train['date'].sort_values(kind='mergesort').index

X_train, X_valid, y_train, y_valid = train_test_split(
    X_all.loc[chrono], y_all.loc[chrono], test_size=0.25, shuffle=False
)

In [None]:
# Linear regression as the benchmark for later comparisons

from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so construction and training can be chained
lr = LinearRegression().fit(X_train, y_train)
# Score on the held-out validation rows
lr.score(X_valid, y_valid)

In [None]:
# Overlay the linear-regression predictions and the true values by row position
y_pred = lr.predict(X_valid)
positions = np.arange(len(y_pred))
for values, name in ((y_pred, 'Prediction'), (y_valid, 'True')):
    plt.scatter(positions, values, alpha=0.3, label=name)
plt.legend(bbox_to_anchor=(1, 1))
plt.show()

In [None]:
# CatBoost - a gradient boosting machine
# For me, it is still a black box, which requires further digging.

from catboost import CatBoostRegressor

# Seeded for repeatable runs; verbose=0 is passed to fit() to keep training quiet
cb = CatBoostRegressor(random_state=42)
cb.fit(X=X_train, y=y_train, verbose=0)
# Score on the held-out validation rows
cb.score(X=X_valid, y=y_valid)

In [None]:
# Overlay the CatBoost predictions and the true values by row position
y_pred = cb.predict(X_valid)
row_positions = np.arange(len(y_pred))
plt.scatter(row_positions, y_pred, label='Prediction', alpha=0.3)
plt.scatter(row_positions, y_valid, label='True', alpha=0.3)
plt.legend(bbox_to_anchor=(1, 1))
plt.show()

## We can see that linear regression is limited by its model capacity,
## whereas CatBoost is able to capture these non-linear relationships and interaction effects.

***

In [None]:
# Fit the model with all training data
# Re-train the same CatBoost instance on X_all / y_all, i.e. on every training row,
# including the rows that were held out for validation above.

cb.fit(X_all, y_all, verbose=0)

***

# 6. Output

In [None]:
# Display the test frame before the feature steps are replayed on it
df_test

### **"Replay" the procedures**

In [None]:
# Convert str to datetime for 'date' column
df_test['date'] = pd.to_datetime(df_test['date'])

# Create some features for date

df_test['year'] = df_test.date.dt.year
df_test['quarter'] = df_test.date.dt.quarter
df_test['month'] = df_test.date.dt.month
df_test['day'] = df_test.date.dt.day
df_test['dayofweek'] = df_test.date.dt.dayofweek
df_test['dayofyear'] = df_test.date.dt.dayofyear
# `Series.dt.weekofyear` was removed in pandas 2.0; `isocalendar().week` replaces it.
# Cast from nullable UInt32 to int64 to keep the dtype the old accessor produced.
df_test['weekofyear'] = df_test.date.dt.isocalendar().week.astype('int64')

# Create dummy variables
# Encode every level here (no drop_first): which level counts as "first" depends on the
# values present in this frame, so dropping independently on train and test could
# remove different levels while still yielding the same number of columns.
dummy_c = pd.get_dummies(df_test['country'], prefix='c_')
dummy_s = pd.get_dummies(df_test['store'], prefix='s_')
dummy_p = pd.get_dummies(df_test['product'], prefix='p_')

# Concatenate data, then keep exactly the columns the model was trained on, in the
# training order. Levels absent from the test set become all-zero columns; levels that
# were dropped (or never seen) during training are discarded.
X_test = pd.concat([df_test[date_features], dummy_c, dummy_s, dummy_p], axis=1)
X_test = X_test.reindex(columns=X_all.columns, fill_value=0)

In [None]:
# Validate the data
# Comparing only the column count would miss renamed or reordered columns, so also
# check that the test features carry the same names, in the same order, as the
# training features.

assert X_test.shape[1] == X_all.shape[1]
assert list(X_test.columns) == list(X_all.columns)

In [None]:
# Predict, then turn the continuous model output into valid sales counts:
#  - round to the nearest integer (a bare astype('int') truncates toward zero, which
#    systematically under-predicts);
#  - clip at 0, since a negative number of sales is not meaningful.
df_test['num_sold'] = np.clip(np.rint(cb.predict(X_test)), 0, None).astype('int')

df_test

In [None]:
# Keep only the row id and the prediction for the submission
output = df_test.loc[:, ['row_id', 'num_sold']]
output

In [None]:
# Validate the data
# The submission must have the same number of rows and columns as the sample file

assert(df_sample.shape == output.shape)

In [None]:
# Save the file for submission
# index=False keeps the DataFrame index out of the CSV

output.to_csv('submission.csv', index=False)

***

# ***7. Findings and Thoughts***


## From this short and shallow analysis, we captured some trend and seasonal components in the given datasets.
## We can see that Kaggle Rama outperforms Kaggle Mart on average, especially in Norway!
## In order to increase revenue or visibility, some promotions and campaigns are needed between Autumn and Winter.
## In my opinion, the order of observations (sequence) is very important for time-series data, so using some traditional time-series forecasting models might be a good idea as well.
## I'll try another version using SARIMA, Holt-Winters, and NN (if possible).

***

## **BTW, this notebook skipped cross-validation, hyper-parameter optimization, a proper evaluation metric, etc., which is A BIG PROBLEM!**