In [None]:
!pip3 install pmdarima

In [None]:
import datetime as dt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression

from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.arima_model import ARIMA
import pmdarima as pm

import re

# Show floats with two decimals wherever pandas renders them.
pd.set_option('display.float_format', '{:.02f}'.format)

# Loading data

In [None]:
# Directory that holds the competition CSV files.
prefix = '/kaggle/input/competitive-data-science-predict-future-sales/'

# The four training-side tables are read with default options.
item_df, item_category_df, shop_df, train_df = (
    pd.read_csv(prefix + file_name)
    for file_name in ('items.csv', 'item_categories.csv', 'shops.csv', 'sales_train.csv')
)
# The first column of test.csv is used as the index.
test_df = pd.read_csv(prefix + 'test.csv', index_col=0)

In [None]:
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the output.
item_df.info()
item_df.head()

In [None]:
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the output.
item_category_df.info()
item_category_df.head()

In [None]:
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the output.
shop_df.info()
shop_df.head()

In [None]:
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the output.
train_df.info()
train_df.head()

In [None]:
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the output.
test_df.info()
test_df.head()

# Preprocessing

In [None]:
# Index every lookup table by its own ID column.
item_df, item_category_df, shop_df = (
    item_df.set_index('item_id'),
    item_category_df.set_index('item_category_id'),
    shop_df.set_index('shop_id'),
)

In [None]:
# Remove exact duplicate rows and renumber the index from 0.
train_df = train_df.drop_duplicates(ignore_index=True)

Turn the `date` column into `datetime` type.

In [None]:
# Dates are stored as day.month.year strings.
parsed_dates = pd.to_datetime(train_df['date'], format='%d.%m.%Y')
train_df['date'] = parsed_dates

In [None]:
# Attach each sale's item category by looking the item up in item_df
# (which is indexed by item_id).
item_categories = item_df['item_category_id']
train_df = train_df.join(item_categories, on='item_id')

In [None]:
# Keep only rows whose price and daily count are both non-negative.
has_valid_price = train_df['item_price'] >= 0
has_valid_count = train_df['item_cnt_day'] >= 0
train_df = train_df[has_valid_price & has_valid_count]
train_df.shape

In [None]:
# Summarise only the datetime column(s), asking pandas to treat them as numeric.
# NOTE(review): `datetime_is_numeric` appears to have been removed in pandas 2.0 —
# confirm the pandas version this notebook runs on before upgrading.
train_df.describe(include='datetime', datetime_is_numeric=True)

In [None]:
# Cap daily counts to the range [0, 20].  `assign` builds a new frame instead
# of writing into `train_df`, which at this point is a filtered selection and
# would otherwise raise SettingWithCopyWarning.
train_df = train_df.assign(item_cnt_day=train_df['item_cnt_day'].clip(0, 20))

### Removing outliers

In [None]:
# Drop rows with an extreme price (300000 or more).
is_reasonable_price = train_df['item_price'].lt(300000)
train_df = train_df.loc[is_reasonable_price]

# Building models

## Training/validation split

Splitting training and validation set.

In [None]:
# Date block 33 is held out for validation; every earlier block is training data.
block_numbers = train_df['date_block_num']
train_set = train_df[block_numbers.lt(33)].copy()
val_set = train_df[block_numbers.eq(33)].copy()

Reshape the validation set into the same format as the test data.

In [None]:
# Total count per (shop, item) pair over the validation month.
grouped = val_set.groupby(['shop_id', 'item_id'], as_index=False)['item_cnt_day'].sum()
X_val, y_val = grouped[['shop_id', 'item_id']], grouped['item_cnt_day']

## Splitting validation set

Because some items in the validation set do not appear in the training set, predicting values for them is challenging. No sales history is recorded for these items, so they have to be predicted differently from items that do have a sales history. We therefore split the validation set and predict the two groups of items with separate methods.

In [None]:
# Boolean array: True where the validation item also occurs in the training set.
known_item_ids = train_set['item_id'].unique()
in_training_set = np.isin(X_val['item_id'], known_item_ids)
# Number of validation rows whose item never appears in training.
np.logical_not(in_training_set).sum()

In [None]:
# "in": item has sales history in the training set; "out": it does not.
X_val_in, y_val_in = X_val[in_training_set], y_val[in_training_set]
X_val_out, y_val_out = X_val[~in_training_set], y_val[~in_training_set]

## X_val_out

Items may not have a sales history, but their categories can still be determined.

In [None]:
def get_category_id(item_df, item_id):
    """Return the category ID of ``item_id`` as a plain ``int``.

    ``item_df`` must be indexed by item ID and have an ``item_category_id``
    column.  The first row whose index equals ``item_id`` is used; an
    ``IndexError`` is raised when no row matches.
    """
    matching_categories = item_df.loc[item_df.index == item_id, 'item_category_id']
    return int(matching_categories.iloc[0])

To predict sales for the "brand new" items, we will use information from other items in the training set that belong to the same category and are sold in the same shop.

Let's take an example: we need to predict next month's sales of sake in alcohol shop A, but shop A has never sold sake before. One approach is to use the sales of the other types of wine that shop A has recorded. We can use the _average_ sales of all the wines in shop A in the previous month to predict the sales of sake next month. Or, more generally, we can use the average sales of wines over many previous months to predict how sake will sell this month. Since the item is new anyway, the only way to predict its performance is to rely on similar items that have already been on the market.

In [None]:
def get_categories_sale_history(train_set, n_months=33):
    """Build per-shop, per-category histories of average monthly item sales.

    For every (month, shop, category) the monthly totals of the individual
    items are computed first and then averaged over the items of that category
    sold in that shop during that month.

    Parameters
    ----------
    train_set : pandas.DataFrame
        Sales rows with the columns ``date_block_num``, ``shop_id``,
        ``item_id``, ``item_category_id`` and ``item_cnt_day``.
    n_months : int, default 33
        Length of every returned history.  Histories are indexed by the date
        blocks ``0 .. n_months - 1``; months without sales are filled with 0
        and rows from later date blocks are ignored.

    Returns
    -------
    dict
        ``{shop_id: {category_id: pandas.Series}}``.  Only (shop, category)
        pairs that occur in ``train_set`` get an entry.
    """
    group_keys = ['item_category_id', 'date_block_num', 'shop_id']

    # One pass over the data instead of re-filtering the frame per category:
    # first the monthly total of every item, then the mean over the items.
    item_month_totals = train_set.groupby(group_keys + ['item_id'])['item_cnt_day'].sum()
    category_month_means = item_month_totals.groupby(level=group_keys).mean()

    average_sales_dict = dict()
    pair_levels = ['item_category_id', 'shop_id']
    for (category_id, shop_id), history in category_month_means.groupby(level=pair_levels):
        # Leave only the month level so the series can be aligned to 0..n_months-1.
        history = history.droplevel(pair_levels)
        history = history.fillna(0).reindex(range(n_months), fill_value=0)
        average_sales_dict.setdefault(shop_id, dict())[category_id] = history
    return average_sales_dict

In [None]:
%%time
categories_sales_dict = get_categories_sale_history(train_df)

In [None]:
def predict_X_out(X_out, categories_sales_dict, predictor):
    """Predict sales for (shop, item) pairs whose item has no sales history.

    Each pair is predicted from the history of the item's category in the
    same shop.  The category is looked up in the module-level ``item_df``.

    Parameters
    ----------
    X_out : pandas.DataFrame
        Two columns, shop ID then item ID, one row per pair to predict.
    categories_sales_dict : dict
        ``{shop_id: {category_id: pandas.Series}}`` of category histories.
    predictor : callable
        Maps a history ``Series`` to a single predicted value.

    Returns
    -------
    pandas.Series
        One prediction per row of ``X_out``, sharing its index.
    """
    # History used when the shop has no record for the category: a single zero.
    empty_history = pd.Series([0.0])

    preds = []
    for shop_id, item_id in X_out.values:
        category_id = get_category_id(item_df, item_id)
        sale_history = categories_sales_dict.get(shop_id, dict()).get(category_id, empty_history)
        preds.append(predictor(sale_history))

    return pd.Series(preds, index=X_out.index)

In [None]:
def strip_zeros(sale_history):
    """Drop the leading zeros of a history.

    Everything from the first non-zero value onwards is returned.  A history
    without any non-zero value is returned unchanged.
    """
    is_nonzero = np.asarray(sale_history != 0)
    # argmax of a boolean array is the position of its first True (0 if none).
    first_nonzero = is_nonzero.argmax()
    return sale_history[first_nonzero:]

### Naive ways

In [None]:
def naive_predictor(sale_history):
    """Predict the next value as the last observed value of the history."""
    last_position = len(sale_history) - 1
    return sale_history.iloc[last_position]

In [None]:
naive_out = predict_X_out(X_val_out, categories_sales_dict, naive_predictor)

# Score on values clipped to [0, 20].
naive_truth_clipped = y_val_out.clip(0, 20)
naive_pred_clipped = naive_out.clip(0, 20)
print(f'RMSE = {np.sqrt(mean_squared_error(naive_truth_clipped, naive_pred_clipped))}')
sns.histplot(naive_truth_clipped - naive_pred_clipped);

### Linear regression

In [None]:
def linear_predictor(sale_history):
    """Predict the next value by extrapolating a straight-line fit.

    The history is regressed on its positions ``0 .. n - 1`` and the line is
    evaluated at position ``n``, i.e. exactly one step past the last
    observation.  Missing values are replaced by the history's mean before
    fitting; when more than 30% of the history is missing the naive
    predictor is used instead.
    """
    n_obs = len(sale_history)
    if sale_history.isna().sum() / n_obs > 0.3:
        return naive_predictor(sale_history)

    y = sale_history.fillna(sale_history.mean())
    positions = np.arange(n_obs).reshape(-1, 1)
    model = LinearRegression().fit(positions, y)
    # The training positions end at n_obs - 1, so the next step is n_obs
    # (n_obs + 1 would forecast two steps ahead).
    pred = model.predict(np.array([[n_obs]]))[0]
    return pred

In [None]:
%%time
linear_out = predict_X_out(X_val_out, categories_sales_dict, linear_predictor)
print(f'RMSE = {np.sqrt(mean_squared_error(y_val_out.clip(0, 20), linear_out.clip(0, 20)))}')
sns.histplot(y_val_out.clip(0, 20) - linear_out.clip(0, 20));

### Exponential Smoothing

In [None]:
def exponential_smoothing_predictor(sale_history):
    """Predict the next value with exponential smoothing.

    Smoothing is used only for dense histories: none of the last five values
    may be zero and fewer than 20% of all values may be zero.  Any other
    history is handled by the naive predictor.
    """
    recent_zero_count = (sale_history.iloc[-5:] == 0).sum()
    zero_share = (sale_history == 0).sum() / len(sale_history)
    if recent_zero_count != 0 or zero_share >= 0.2:
        return naive_predictor(sale_history)

    fitted = ExponentialSmoothing(strip_zeros(sale_history).values).fit()
    # forecast(1) is the one-step-ahead forecast whatever the length of the
    # (possibly stripped) history; a fixed position such as predict(33) is
    # only "the next step" when exactly 33 observations were fitted.
    return fitted.forecast(1)[0]

In [None]:
exp_out = predict_X_out(X_val_out, categories_sales_dict, exponential_smoothing_predictor)

# Score on values clipped to [0, 20].
exp_truth_clipped = y_val_out.clip(0, 20)
exp_pred_clipped = exp_out.clip(0, 20)
print(f'RMSE = {np.sqrt(mean_squared_error(exp_truth_clipped, exp_pred_clipped))}')
sns.histplot(exp_truth_clipped - exp_pred_clipped);

### SARIMA

In [None]:
def arima_predictor(sale_history, seasonal=False, m=1):
    """Predict the next value with an automatically selected ARIMA model.

    Leading zeros are stripped first; when fewer than 8 observations remain
    the naive predictor is used instead.

    Parameters
    ----------
    sale_history : pandas.Series
        Sales history, oldest value first.
    seasonal : bool, default False
        Forwarded to ``pm.auto_arima`` as ``seasonal``.
    m : int, default 1
        Forwarded to ``pm.auto_arima`` as ``m`` (the seasonal period).
        NOTE(review): with ``m=1`` pmdarima presumably falls back to a
        non-seasonal fit even when ``seasonal=True`` — confirm, and pass the
        real period (e.g. 12 for monthly data) to get a seasonal model.

    Returns
    -------
    The one-step-ahead forecast.
    """
    y = strip_zeros(sale_history)
    if len(y) < 8:
        return naive_predictor(sale_history)

    model = pm.auto_arima(
        y,
        test='adf',
        max_p=4,
        max_q=4,
        seasonal=seasonal,
        m=m,
    )

    # predict() may return a Series labelled with future positions, in which
    # case `[0]` would be a label lookup; take the first value by position.
    forecast = np.asarray(model.predict(n_periods=1))
    return forecast[0]

In [None]:
def sarima_predictor(sale_history):
    """Forecast with `arima_predictor`, with its seasonal option switched on.

    NOTE(review): no seasonal period is passed along, so the library default
    applies — confirm that a seasonal model is actually being fitted.
    """
    forecast = arima_predictor(sale_history, seasonal=True)
    return forecast

In [None]:
idx = 13
shop, item = X_val_out.iloc[idx]
# Fall back to a one-element zero Series, as predict_X_out does; a bare float
# cannot be plotted as a line or passed to the predictors, which need a Series.
sale_history = categories_sales_dict.get(shop, dict()).get(get_category_id(item_df, item), pd.Series([0.0]))

sns.lineplot(data=sale_history, color='blue')
sns.scatterplot(x=[33], y=[y_val_out.iloc[idx]], color='blue')
# One marker per predictor at the month being predicted; the plotting order
# must match the legend labels below.
for predictor, color in [
    (naive_predictor, 'orange'),
    (linear_predictor, 'green'),
    (exponential_smoothing_predictor, 'purple'),
    (sarima_predictor, 'gray'),
    (arima_predictor, 'red'),
]:
    sns.scatterplot(x=[33], y=[predictor(sale_history)], color=color)

plt.legend(['Sale history', 'True value', 'Naive', 'Linear', 'Exp', 'SARIMA', 'ARIMA'])
plt.show()

## X_val_in

In [None]:
def get_item_month_sale_history(train_df, item_id, shop_id=None, n_months=33):
    """Return the monthly sales totals of one item.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Sales rows with the columns ``date_block_num``, ``shop_id``,
        ``item_id`` and ``item_cnt_day``.
    item_id
        Item whose history is wanted.
    shop_id : optional
        Restrict the history to this shop; ``None`` sums over all shops.
    n_months : int, default 33
        Length of the returned history.  It is indexed by the date blocks
        ``0 .. n_months - 1``; months without sales are 0 and rows from later
        date blocks are ignored.

    Returns
    -------
    pandas.Series
        Total ``item_cnt_day`` per date block.
    """
    selected = train_df['item_id'] == item_id
    if shop_id is not None:
        selected = selected & (train_df['shop_id'] == shop_id)

    monthly_totals = train_df[selected].groupby('date_block_num')['item_cnt_day'].sum()
    return monthly_totals.reindex(range(n_months), fill_value=0)

In [None]:
def predict_X_in(train_set, X_in, predictor):
    """Predict sales for (shop, item) pairs from the pair's own monthly history.

    ``X_in`` holds the shop ID and the item ID in its two columns.  For every
    row the monthly history of that item in that shop is taken from
    ``train_set`` and handed to ``predictor``.  The result is a ``Series`` of
    predictions that shares the index of ``X_in``.
    """
    predictions = [
        predictor(get_item_month_sale_history(train_set, item_id, shop_id))
        for shop_id, item_id in X_in.values
    ]
    return pd.Series(predictions, index=X_in.index)

In [None]:
idx = 32
shop, item = X_val_in.iloc[idx]
sale_history = get_item_month_sale_history(train_set, item, shop)

sns.lineplot(data=sale_history, color='blue')
sns.scatterplot(x=[33], y=[y_val_in.iloc[idx]], color='blue')
# One marker per predictor at the month being predicted; the plotting order
# must match the legend labels below.
for predictor, color in [
    (naive_predictor, 'orange'),
    (linear_predictor, 'green'),
    (exponential_smoothing_predictor, 'purple'),
    (sarima_predictor, 'gray'),
    (arima_predictor, 'red'),
]:
    sns.scatterplot(x=[33], y=[predictor(sale_history)], color=color)

plt.legend(['Sale history', 'True value', 'Naive', 'Linear', 'Exp', 'SARIMA', 'ARIMA'])
plt.show()

## Combining prediction

In [None]:
def combine_prediction(index, pred_in, pred_out):
    """Merge two partial prediction series into one series over ``index``.

    Parameters
    ----------
    index : pandas.Index
        Index of the combined result.
    pred_in, pred_out : pandas.Series
        Predictions whose index labels are all contained in ``index``.

    Returns
    -------
    pandas.Series
        Float series over ``index``; positions covered by neither input are
        0.  Where both inputs cover a label, ``pred_out`` wins.
    """
    # Start from a float series: writing float predictions into an integer
    # series relies on implicit upcasting, which recent pandas deprecates.
    result = pd.Series(0.0, index=index)
    result.loc[pred_in.index] = pred_in
    result.loc[pred_out.index] = pred_out
    return result

## Final prediction

In [None]:
# Work on copies so the final run leaves train_df / test_df untouched.
X_train, X_test = train_df.copy(), test_df.copy()

In [None]:
# Split the test pairs by whether their item occurs anywhere in the training data.
known_item_ids = X_train['item_id'].unique()
in_train = X_test['item_id'].isin(known_item_ids)
X_test_in, X_test_out = X_test[in_train], X_test[~in_train]

In [None]:
%%time
categories_sale_history_dict = get_categories_sale_history(X_train)
pred_in = predict_X_in(X_train, X_test_in, exponential_smoothing_predictor)
pred_out = predict_X_out(X_test_out, categories_sale_history_dict, exponential_smoothing_predictor)
results = combine_prediction(X_test.index, pred_in, pred_out)

# Submitting

In [None]:
# Clip the predictions to [0, 20], the same range every validation score in
# this notebook is computed on, before writing them out.
# to_frame() names the column explicitly, whatever name `results` carries.
submission = results.clip(0, 20).to_frame('item_cnt_month')
submission.index.name = 'ID'
submission.to_csv('submission.csv')

In [None]:
!sed -n 1,10p submission.csv