# Kaggle Demand Forecasting with Fast.ai

See [competition details](https://www.kaggle.com/c/demand-forecasting-kernels-only)

This is largely based on the fast.ai lesson 3 notebook for the Rossmann store sales forecasting challenge.

In [None]:

%matplotlib  inline
%reload_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np

from fastai.structured import *
from fastai.column_data import *
# Summarise printed arrays with more than 50 elements, showing 20 items at each edge.
np.set_printoptions(threshold=50, edgeitems=20)

# Output directory prefix; used further down for the learner's tmp and models folders.
PATH_WRITE = "/kaggle/working/"


# Load Data

In [None]:

# Read the training data, the test data and the sample submission, in that order.
train, test, ssub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/train.csv',
        '../input/test.csv',
        '../input/sample_submission.csv',
    )
)

# Quick sanity check of the frame sizes.
print(f'train: {train.shape}', f'test {test.shape}')

In [None]:
# Preview the first rows of the raw training frame.
train.head()

In [None]:
# Preview the first rows of the raw test frame.
test.head()

In [None]:
# Treat the store and item columns as categoricals in both frames.
for frame in (train, test):
    for id_col in ('store', 'item'):
        frame[id_col] = frame[id_col].astype('category')

# Summary statistics over every column, not only the numeric ones.
train.describe(include='all')

In [None]:
# Count missing values per column of the training frame.
train.isnull().sum()

# Feature Engineering

In [None]:
# Engineer features on copies so the frames loaded above stay untouched.
train2, test2 = train.copy(), test.copy()

# add_datepart works on the frame in place (its return value is not used);
# drop=False keeps the original "date" column, which becomes the index later on.
for frame in (train2, test2):
    add_datepart(frame, "date", drop=False)

train2.head()

In [None]:
# Preview the test frame after the date features were added.
test2.head()

In [None]:
# Every column of train2 except the target ('sales'), the continuous 'Elapsed'
# feature and the raw 'date' column is handled as a categorical variable.
# Built with a filter rather than by calling list.remove() inside a throwaway
# list comprehension.
cat_vars = [col for col in train2 if col not in ('sales', 'Elapsed', 'date')]

# Ordered categoricals give each level a stable integer code.
for v in cat_vars:
    train2[v] = train2[v].astype('category').cat.as_ordered()

# fastai's apply_cats: presumably re-encodes test2's columns with the category
# levels of train2 so both frames share the same codes (confirm against the
# installed fastai version).
apply_cats(test2, train2)

In [None]:
# Continuous columns: replace missing values with 0 and store them as float32.
for cont_var in ('sales', 'Elapsed'):
    train2[cont_var] = train2[cont_var].fillna(0).astype('float32')
    if cont_var not in test2:
        # Only convert columns the test frame actually has
        # (presumably 'sales' is absent there at this point).
        continue
    test2[cont_var] = test2[cont_var].fillna(0).astype('float32')

In [None]:
# Index both frames by date; the validation split further down filters on this index.
train2, test2 = (frame.set_index('date') for frame in (train2, test2))

# proc_df (fastai) separates the 'sales' target from the features. The returned
# NA dictionary and scaling mapper are passed back in when the test frame is
# processed.
df, y, nas, mapper = proc_df(train2, 'sales', do_scale=True)

# Train on log(1 + sales); a sales value of 0 maps to exactly 0.
yl = np.log(1 + y)

In [None]:
# Placeholder target so the test frame can go through proc_df with the same
# target field name as the training frame (presumably required by proc_df).
test2['sales'] = 0

# Reuse the training mapper and NA dictionary; 'id' is skipped as a feature.
df_test, _, nas, mapper = proc_df(
    test2,
    'sales',
    do_scale=True,
    skip_flds=['id'],
    mapper=mapper,
    na_dict=nas,
)

In [None]:
# Column dtypes and non-null counts of the processed test features.
df_test.info()

In [None]:
# Column dtypes and non-null counts of the processed training features.
df.info()

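Use a time-based validation split (the last three months of the training period, October–December 2017), since the goal on the test set is likewise to forecast a later period.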

In [None]:
# Validation window: 2017-10-01 (inclusive) up to 2018-01-01 (exclusive).
val_start = datetime.datetime(2017,10,1)
val_end = datetime.datetime(2018,1,1)
in_val_window = (df.index >= val_start) & (df.index < val_end)

# Positional row numbers of the rows that fall inside the window.
val_idx = np.flatnonzero(in_val_window)

# Model

First, we need to make sure our evaluation metric matches the one used by the competition (SMAPE).

In [None]:
def inv_y(a):
    """Undo the ``log(y + 1)`` target transform: return ``exp(a) - 1``."""
    return np.exp(a) - 1


def smape(y_pred, targ):
    """Return the mean symmetric absolute percentage error, as a fraction.

    Both arguments are expected in log space (``log(y + 1)``) and are mapped
    back with :func:`inv_y` before the error is computed, so the score is
    expressed on the original sales scale.

    Each element contributes ``2 * |pred - targ| / (|pred| + |targ|)``. An
    element whose prediction and target are both zero contributes 0 rather
    than the NaN a literal 0/0 would produce, which would otherwise turn the
    whole mean into NaN.

    Args:
        y_pred: Predicted values in log space (array-like or scalar).
        targ: Target values in log space, same shape as ``y_pred``.

    Returns:
        The mean of the per-element errors (0 is a perfect score, 2 the worst).
    """
    targ = inv_y(targ)
    pred = inv_y(y_pred)
    numer = 2 * np.abs(pred - targ)
    denom = np.abs(pred) + np.abs(targ)
    both_zero = denom == 0
    # Divide by 1 where the denominator is zero to avoid a 0/0 warning, then
    # force those positions to 0.
    safe_denom = np.where(both_zero, 1, denom)
    ape = np.where(both_zero, 0.0, numer / safe_denom)
    return ape.mean()

# Largest log-transformed target seen in training.
max_log_y = np.max(yl)

# Output range handed to the learner: from 0 up to 20% above the largest target.
y_range = (0, 1.2 * max_log_y)

In [None]:
class _ColumnarModelData(ColumnarModelData):
    """ColumnarModelData whose ``from_data_frames`` forwards ``is_reg`` to every dataset.

    NOTE(review): presumably overrides the library method to match the
    signature expected by the installed fastai version — confirm.
    """

    @classmethod
    def from_data_frames(cls, path, trn_df, val_df, trn_y, val_y, cat_flds, bs, is_reg, test_df=None):
        """Build model data from training, validation and optional test frames.

        Each frame is wrapped with ``ColumnarDataset.from_data_frame`` using
        the same ``cat_flds`` and ``is_reg``; the test dataset gets no target.
        ``test_df=None`` results in ``test_ds=None``.
        """
        def build_dataset(frame, target):
            return ColumnarDataset.from_data_frame(frame, cat_flds, target, is_reg)

        # Construction order kept as test, train, validation.
        test_ds = None if test_df is None else build_dataset(test_df, None)
        trn_ds = build_dataset(trn_df, trn_y)
        val_ds = build_dataset(val_df, val_y)
        return cls(path, trn_ds, val_ds, bs, test_ds=test_ds)


# Model data: rows listed in val_idx form the validation set, the rest of df is
# used for training; the target is the float32 log-sales vector.
md = _ColumnarModelData.from_data_frame(
    '.',
    val_idx,
    df,
    yl.astype(np.float32),
    cat_flds=cat_vars,
    bs=128,
    test_df=df_test,
)

Determine the number of levels (cardinality) of each categorical variable, and from that the size of its embedding.

In [None]:
# (variable name, number of category levels + 1) for every categorical variable.
# The extra slot is presumably reserved for missing/unknown values — confirm.
cat_sz = [
    (name, 1 + len(train2[name].cat.categories))
    for name in cat_vars
]
cat_sz

In [None]:
# (cardinality, embedding width) pairs: the width is roughly half the
# cardinality (rounded up), capped at 50.
emb_szs = [
    (n_levels, min(50, (n_levels + 1) // 2))
    for _, n_levels in cat_sz
]
emb_szs

In [None]:

# Number of continuous inputs: all feature columns that are not categorical.
n_cont = len(df.columns) - len(cat_vars)

# NOTE(review): the positional arguments presumably follow fastai 0.7's
# get_learner(emb_szs, n_cont, emb_drop, out_sz, szs, drops) — confirm against
# the installed version.
m = md.get_learner(
    emb_szs,
    n_cont,
    0.04,
    1,
    [1000,500],
    [0.001,0.01],
    y_range=y_range,
    tmp_name=f"{PATH_WRITE}tmp",
    models_name=f"{PATH_WRITE}models",
)

In [None]:
# Learning rate used for training below; set by hand, not read from the finder.
lr = 1e-3
# Run the learner's learning-rate finder; its results are plotted in the next cell.
m.lr_find()

In [None]:
# Plot the learning-rate finder results.
# NOTE(review): 100 is presumably the number of initial iterations to skip — confirm.
m.sched.plot(100)

In [None]:
# Train with learning rate `lr`, reporting SMAPE as the metric.
# NOTE(review): 3 is presumably the number of cycles/epochs — confirm.
m.fit(lr, 3, metrics=[smape])

In [None]:
# Save the trained learner under the name 'val0'.
m.save('val0')

In [None]:
# Reload what was saved under 'val0', so later cells can run without retraining.
m.load('val0')

In [None]:
# Predictions and targets from the learner; both are fed to smape in the next
# cell, which treats its inputs as log-space values.
# NOTE: this rebinds `y`, replacing the training target returned by proc_df earlier.
x,y=m.predict_with_targs()

In [None]:
# SMAPE of the predictions against their targets (both converted back from log space).
smape(x, y)

In [None]:
# Predict with the learner.
# NOTE(review): the True argument presumably selects the test set — confirm.
pred_test=m.predict(True)

In [None]:
# Convert the log-space predictions back to sales: exp(pred) - 1, via inv_y.
pred_test = inv_y(pred_test)

In [None]:
# Overwrite the placeholder 'sales' column of the test frame with the predictions.
test2['sales'] = pred_test

In [None]:
# Write the id and sales columns to 'predictions0.csv' (relative to the current
# working directory); index=False leaves the date index out of the file.
test2[['id','sales']].to_csv('predictions0.csv', index=False)