In [None]:
import os
import gc
import time
import math
import datetime
from math import log, floor
from sklearn.neighbors import KDTree

import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.utils import shuffle
from tqdm.notebook import tqdm as tqdm

import seaborn as sns
from matplotlib import colors
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize

import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots

import warnings
warnings.filterwarnings("ignore")

# Utilities

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place to narrower dtypes.

    Each column whose dtype is one of int16/32/64 or float16/32/64 is cast to
    the smallest integer (int8..int64) or float (float16..float64) type whose
    representable range contains the column's min and max. Range limits are
    inclusive, so a column whose maximum is exactly 127 still fits int8.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; its columns are reassigned in place.
    verbose : bool, default True
        When true, print the final memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    Float columns are chosen by range only, so a cast to float16/float32 can
    lose precision.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Try the integer types from narrowest to widest; an empty column
            # yields NaN bounds, fails every comparison and is left untouched.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # NaN or infinite bounds fail both range checks and fall through to float64.
            if np.finfo(np.float16).min <= c_min and c_max <= np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif np.finfo(np.float32).min <= c_min and c_max <= np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero starting size.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df
#
def autocorrelation(ys, t=1):
    """Return the correlation matrix between ``ys`` and itself shifted by ``t``.

    Parameters
    ----------
    ys : array-like supporting slicing
        The series.
    t : int, default 1
        Lag in samples. ``t=0`` correlates the series with itself.

    Returns
    -------
    numpy.ndarray
        The 2x2 matrix from ``np.corrcoef``; the off-diagonal entry is the
        lag-``t`` autocorrelation.
    """
    # ``ys[:-0]`` would be an empty slice, so the zero lag is handled explicitly.
    head = ys if t == 0 else ys[:-t]
    return np.corrcoef(head, ys[t:])

# Load the dataset

In [None]:
# Input locations: the competition files and an additional dataset that holds sales.csv.
INPUT_DIR = '../input/m5-forecasting-uncertainty'
INPUT_DIR2 = '../input/walmartadd'
# Calendar, prices and the sample submission come from the competition directory.
calendar = pd.read_csv(f'{INPUT_DIR}/calendar.csv')
selling_prices = pd.read_csv(f'{INPUT_DIR}/sell_prices.csv')
sample_submission = pd.read_csv(f'{INPUT_DIR}/sample_submission.csv')
# The sales table is read from the second directory; later cells expect it to
# carry `scale` and `start` columns alongside the daily `d_*` columns.
sales = pd.read_csv(f'{INPUT_DIR2}/sales.csv')

In [None]:
# Preview the first rows of the raw sales table.
sales.head()

In [None]:
# Daily series columns are the ones whose name contains "d_"; everything else
# describes the series (ids, levels, ...).
series_cols = sales.columns[sales.columns.str.contains("d_")].to_numpy()
level_cols = sales.columns[~sales.columns.str.contains("d_")].to_numpy()
# 1-based day numbers, one per daily column.
days = np.arange(len(series_cols)) + 1

In [None]:
# Total unit sales per day, summed over every row of the sales table.
timeseries = sales.loc[:, series_cols].sum(axis=0).to_numpy()
len(timeseries)

## Train/Test available dates

In [None]:
# Plot the aggregated daily sales, split into the training and validation ranges.
# The split index is shared by both slices so no day falls between them
# (slicing with [0:1913] and [1914:] dropped index 1913).
n_train_days = 1913
plt.figure(figsize=(20,5))
plt.plot(days[:n_train_days], timeseries[:n_train_days], label="train")
plt.plot(days[n_train_days:], timeseries[n_train_days:], label="validation")
plt.title("Top-Level-1: Summed product sales of all stores and states");
plt.legend()
plt.xlabel("Day")
plt.ylabel("Unit sales");

## EDA - Hierarchical levels

The dataset hierarchy consists of 12 levels. The first five, explored below, are:

- 1 - The top is given by the **unit sales of all products**, aggregated for all stores/states.
- 2 - Unit sales of all products, aggregated for each **state**.
- 3 - Unit sales of all products, aggregated for each **store**.
- 4 - Unit sales of all products, aggregated for each **category**.
- 5 - Unit sales of all products, aggregated for each **department**.

In [None]:
# One panel per hierarchy level: the grand total first, then the totals
# grouped by state, store, category and department.
sns.set_palette("colorblind")

fig, ax = plt.subplots(5,1,figsize=(20,28))
sales[series_cols].sum().plot(ax=ax[0])
ax[0].set_title("Top-Level-1: Summed product sales of all stores and states")
ax[0].set_ylabel("Unit sales of all products");

# (grouping column, panel title) for levels 2-5; the department panel is
# level 5 (it was previously mislabelled as a second "Level-4").
level_panels = [
    ("state_id", "Level-2: Summed product sales of all stores per state"),
    ("store_id", "Level-3: Summed product sales per store"),
    ("cat_id", "Level-4: Summed product sales per category"),
    ("dept_id", "Level-5: Summed product sales per product department"),
]
for panel, (group_col, panel_title) in zip(ax[1:], level_panels):
    sales.groupby(group_col)[series_cols].sum().transpose().plot(ax=panel)
    panel.set_title(panel_title)
    panel.set_ylabel("Unit sales of all products");


# Data Preprocessing

In [None]:
def preprocess_calendar(calendar):
    """Encode the calendar table as zero-based integer features, in place.

    - ``event_name`` / ``event_type`` are copied from ``event_name_1`` /
      ``event_type_1`` and replaced by integer codes assigned in order of
      first appearance.
    - ``nday`` is the integer formed by the last two characters of ``date``.
    - ``wday`` and ``month`` are shifted down by one, ``year`` by 2011.
    - The raw event columns, ``date`` and ``weekday`` are dropped.

    The code tables are stored in the module-level ``maps`` dict and the
    cardinality of every categorical feature in the module-level ``mods``
    dict; both must exist before the call.

    Returns the same (mutated) ``calendar`` frame.
    """
    global maps, mods

    # Encode the two event columns with the same recipe.
    for feature in ("event_name", "event_type"):
        calendar[feature] = calendar[f"{feature}_1"]
        codes = {level: code for code, level in enumerate(calendar[feature].unique())}
        calendar[feature] = calendar[feature].map(codes)
        maps[feature] = codes
        mods[feature] = len(codes)

    calendar['nday'] = calendar['date'].str[-2:].astype(int)

    # Shift the ordinal fields so that they start at zero.
    calendar["wday"] = calendar["wday"] - 1
    calendar["month"] = calendar["month"] - 1
    calendar["year"] = calendar["year"] - 2011

    # Fixed cardinalities of the remaining categorical features.
    mods.update({
        "month": 12,
        "year": 6,
        "wday": 7,
        'snap_CA': 2,
        'snap_TX': 2,
        'snap_WI': 2,
    })

    calendar.drop(
        columns=["event_name_1", "event_name_2", "event_type_1", "event_type_2", "date", "weekday"],
        inplace=True,
    )
    return calendar

In [None]:
def preprocess_sales(sales, start=1400, upper=1970):
    """Trim, extend and melt the wide sales table into long format.

    Parameters
    ----------
    sales : pandas.DataFrame
        Wide table with the id columns ``id, item_id, dept_id, cat_id,
        store_id, state_id, scale, start`` and one ``d_<n>`` column per day.
        It is modified in place (columns dropped and added) before melting.
    start : int or None, default 1400
        When given, columns ``d_1`` .. ``d_<start-1>`` are dropped.
    upper : int, default 1970
        Empty (NaN) columns ``d_1942`` .. ``d_<upper-1>`` are appended as
        placeholders for the days to forecast.

    Returns
    -------
    pandas.DataFrame
        Long table with one row per (series, day): the id columns, the day
        label ``d``, the ``demand`` value and the integer day number ``nb``.
    """
    if start is not None:
        print("dropping...")
        to_drop = [f"d_{i+1}" for i in range(start-1)]
        print(sales.shape)
        sales.drop(to_drop, axis=1, inplace=True)
        print(sales.shape)
    #=======
    print("adding...")
    new_columns = ['d_%i'%i for i in range(1942, upper, 1)]
    for col in new_columns:
        sales[col] = np.nan
    print("melting...")
    sales = sales.melt(id_vars=["id", "item_id", "dept_id", "cat_id", "store_id", "state_id","scale","start"],
                        var_name='d', value_name='demand')

    print("generating order")
    # Take the day number straight from the "d_<n>" label instead of deriving
    # it from the row position, which only worked for exactly 42840 series
    # and contiguous day columns.
    sales["nb"] = sales["d"].str[2:].astype("int64")
    return sales

## Calendar dataset

In [None]:
# Module-level registries filled by the preprocessing steps:
# `maps` holds value -> integer-code tables, `mods` holds the number of
# distinct codes per categorical feature.
maps = {}
mods = {}
calendar.head()

In [None]:
# Encode the calendar features. preprocess_calendar mutates `calendar` in place
# and returns it, so `mod_calendar` and `calendar` are the same frame.
mod_calendar = preprocess_calendar(calendar)
mod_calendar.head()

## Sales dataset

In [None]:
# First day number kept from the sales history; also used when building the training mask.
START = 1400

In [None]:

# Build the long-format sales table. The START constant is reused (instead of
# repeating the literal 1400) so the trimmed history and the training mask
# cannot drift apart.
mod_sales = preprocess_sales(sales, start=START, upper=1970)

In [None]:
# Preview the melted (long-format) sales table.
mod_sales.head()

### Encoding of factors - Sales dataset

In [None]:
# Replace every identifier column by integer codes (order of first appearance),
# recording the code table in `maps` and its size in `mods`.
cols = ["item_id", "dept_id", "cat_id","store_id","state_id"]
for col in cols:
    temp_dct = {level: code for code, level in enumerate(mod_sales[col].unique())}
    maps[col] = temp_dct
    mods[col] = len(temp_dct)
    mod_sales[col] = mod_sales[col].map(temp_dct)
        

In [None]:
# The last code table is already stored in `maps`; drop the temporary name.
del temp_dct
mod_sales.head()

### Add calendar days to sales dataset

In [None]:
# Attach the encoded calendar features to every (series, day) row via the day label `d`.
print("merge with calendar...")
mod_sales = mod_sales.merge(calendar, on='d', how='left')
# Drop the `calendar` name now that its columns live in mod_sales.
del calendar

In [None]:
# Downcast the numeric columns of the merged frame to narrower dtypes.
mod_sales = reduce_mem_usage(mod_sales)

In [None]:
# Preview the frame after the calendar merge and the dtype downcast.
mod_sales.head()

In [None]:
# Order the rows by series and then by day number, so each series forms one
# contiguous, chronologically ordered run, then renumber the index.
print("reordering...")
mod_sales = mod_sales.sort_values(by=["id", "nb"])
print("re-indexing..")
mod_sales = mod_sales.reset_index(drop=True)
gc.collect()

In [None]:
# Zero-based week counter derived from the day number.
mod_sales['n_week'] = mod_sales['nb'].sub(1).floordiv(7)
# Make the day-of-month zero-based and register its cardinality.
mod_sales["nday"] = mod_sales["nday"] - 1
mods['nday'] = 31

In [None]:
# Scaled target: demand divided by the per-row `scale` column.
mod_sales["x"] = mod_sales["demand"] / mod_sales["scale"]

In [None]:
# Preview the frame with the week counter and the scaled target added.
mod_sales.head()

## Generate Sale lags

In [None]:
# Lagged copies of the scaled target, shifted within each series.
LAGS = [28, 35, 42, 49, 56, 63]
FEATS = [f"x_{lag}" for lag in LAGS]
for lag in tqdm(LAGS):
    mod_sales[f"x_{lag}"] = mod_sales.groupby("id")["x"].shift(periods=lag)

In [None]:
# Keep only the rows whose day number lies after the row's `start` value.
print(mod_sales.shape)
mod_sales = mod_sales[mod_sales["nb"] > mod_sales["start"]]
print(mod_sales.shape)

In [None]:
# Boolean row masks for the three splits, based on the day number.
nb = mod_sales['nb'].to_numpy()
MAX_LAG = max(LAGS)
# Out-of-sample alternative for the training window, kept for reference:
#tr_mask = np.logical_and(nb>START + MAX_LAG, nb<=1913)
# NOTE (author): the training window below runs up to day 1941 and therefore
# contains the validation days (1914-1941). Validation is "fake", i.e. not
# out-of-sample; this was kept because it improved the score.
tr_mask = (nb > START + MAX_LAG) & (nb <= 1941)
val_mask = (nb > 1913) & (nb <= 1941)
te_mask = (nb > 1941) & (nb <= 1969)

In [None]:
# Pull the columns used downstream out of the frame as NumPy arrays.
scale = mod_sales['scale'].values
ids = mod_sales['id'].values
# (unused) earlier way of building the scaled target:
#y = sales['demand'].values
#ys = y / scale
# Scaled target and the matrix of lag features (one column per entry of FEATS).
ys = mod_sales['x'].values
Z = mod_sales[FEATS].values

In [None]:
# Scales of the validation and test rows, used later to undo the target scaling.
sv, se = scale[val_mask], scale[te_mask]
# Ids of the test rows, arranged as one row of 28 entries per group of 28
# consecutive rows. Note that `ids` is overwritten here.
ids = ids[te_mask].reshape((-1, 28))

In [None]:
# Calendar features, each extracted as a single-column 2-D array.
ca, tx, wi, wday, month, year, event, nday = (
    mod_sales[[column]].values
    for column in (
        'snap_CA', 'snap_TX', 'snap_WI', 'wday',
        'month', 'year', 'event_name', 'nday',
    )
)

In [None]:
# Encoded identifier features, each extracted as a single-column 2-D array.
item, dept, cat, store, state = (
    mod_sales[[column]].values
    for column in ('item_id', 'dept_id', 'cat_id', 'store_id', 'state_id')
)

In [None]:
def make_data(mask):
    """Select the rows picked by ``mask`` from every feature array.

    Reads the module-level feature arrays (``ca``, ``tx``, ..., ``state``),
    the lag matrix ``Z`` and the scaled target ``ys``.

    Returns
    -------
    (dict, array)
        The model inputs keyed by input name, and the matching targets.
    """
    sources = {
        "snap_CA": ca, "snap_TX": tx, "snap_WI": wi, "wday": wday,
        "month": month, "year": year, "event": event, "nday": nday,
        "item": item, "dept": dept, "cat": cat, "store": store,
        "state": state, "num": Z,
    }
    inputs = {key: values[mask] for key, values in sources.items()}
    return inputs, ys[mask]

In [None]:
# Materialise the inputs/targets of each split from the row masks.
xt, yt = make_data(tr_mask) #train
xv, yv = make_data(val_mask) # val
xe, ye = make_data(te_mask) # test

In [None]:
# Collect the splits under the names used for the pickle files written next.
datasets = {'xt':xt, 'yt': yt, 'xv': xv, 'yv': yv, 'xe': xe, 'ye': ye}

In [None]:
import pickle

# Write every split to "<name>.pickle" in the working directory.
for filename, data in datasets.items():
    with open(str(filename) + '.pickle', 'wb') as handle:
        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
import tensorflow.keras.layers as L
import tensorflow.keras.models as M
import tensorflow.keras.backend as K
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
import tensorflow as tf

In [None]:
#=====
def qloss(y_true, y_pred):
    """Mean pinball (quantile) loss over nine fixed quantile levels.

    ``y_pred`` carries one column per quantile level. The error
    ``y_true - y_pred`` is weighted by ``q`` where it is positive and by
    ``q - 1`` where it is negative, and the result is averaged over all entries.
    """
    levels = np.array([[0.005, 0.025, 0.165, 0.250, 0.500, 0.750, 0.835, 0.975, 0.995]])
    q = tf.constant(levels, dtype=tf.float32)
    residual = y_true - y_pred
    under_forecast = q * residual
    over_forecast = (q - 1) * residual
    return K.mean(tf.maximum(under_forecast, over_forecast))


#============================#
def make_model(n_in):
    """Build and compile the quantile-regression network.

    Every categorical feature gets its own single-value input and an
    embedding layer; the flattened embeddings ("context") are concatenated
    with the ``n_in`` numeric inputs, passed through three 500-unit ReLU
    layers (the context is re-injected before the second and third), and
    mapped to 9 linear outputs. The model is compiled with ``qloss`` and Adam.

    Cardinalities are read from the module-level ``mods`` dict.

    Parameters
    ----------
    n_in : int
        Width of the numeric ("num") input.

    Returns
    -------
    The compiled Keras model named "M1".
    """
    # (input name, key in ``mods``, embedding layer name, embedding width);
    # a width of None means "as wide as the number of levels".
    categorical_spec = [
        ("snap_CA", "snap_CA", "ca_3d", None),
        ("snap_TX", "snap_TX", "tx_3d", None),
        ("snap_WI", "snap_WI", "wi_3d", None),
        ("wday", "wday", "wday_3d", None),
        ("month", "month", "month_3d", None),
        ("year", "year", "year_3d", None),
        ("event", "event_name", "event_3d", None),
        ("nday", "nday", "nday_3d", None),
        ("item", "item_id", "item_3d", 10),
        ("dept", "dept_id", "dept_3d", None),
        ("cat", "cat_id", "cat_3d", None),
        ("store", "store_id", "store_3d", None),
        ("state", "state_id", "state_3d", None),
    ]

    # Inputs: the numeric block first, then one input per categorical feature.
    num = L.Input((n_in,), name="num")
    inp = {}
    for input_name, _, _, _ in categorical_spec:
        inp[input_name] = L.Input((1,), name=input_name)
    inp["num"] = num

    # One embedding per categorical input.
    embedded = []
    for input_name, mods_key, layer_name, width in categorical_spec:
        n_levels = mods[mods_key]
        out_dim = n_levels if width is None else width
        embedded.append(L.Embedding(n_levels, out_dim, name=layer_name)(inp[input_name]))

    emb = L.Concatenate(name="embds")(embedded)
    context = L.Flatten(name="context")(emb)

    # Dense trunk; the context is concatenated back in after each of the first two blocks.
    x = L.Concatenate(name="x1")([context, num])
    for dense_name, merge_name in (("d1", "m1"), ("d2", "m2")):
        x = L.Dense(500, activation="relu", name=dense_name)(x)
        x = L.Dropout(0.3)(x)
        x = L.Concatenate(name=merge_name)([x, context])
    x = L.Dense(500, activation="relu", name="d3")(x)

    preds = L.Dense(9, activation="linear", name="preds")(x)
    model = M.Model(inp, preds, name="M1")
    model.compile(loss=qloss, optimizer="adam")
    return model

In [None]:
# Build the network and the training callbacks.
net = make_model(len(FEATS))
# Keep the weights of the epoch with the lowest validation loss in w.h5.
ckpt = ModelCheckpoint("w.h5", monitor='val_loss', verbose=1, save_best_only=True, mode='min')
# The model is compiled with optimizer="adam", whose default learning rate is
# 0.001. min_lr must lie below that, and the patience must be shorter than
# EarlyStopping's, otherwise this callback can never lower the learning rate.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2,
                              patience=2, min_lr=1e-5)
es = EarlyStopping(monitor='val_loss', patience=3)
# summary() prints the table itself and returns None, so it is not wrapped in print().
net.summary()

In [None]:
# Train for at most 20 epochs, validating on (xv, yv) with the checkpoint,
# learning-rate and early-stopping callbacks attached.
net.fit(xt, yt, batch_size=50_000, epochs=20, validation_data=(xv, yv), callbacks=[ckpt, reduce_lr, es])

In [None]:
# Predict the nine quantiles for the validation and test rows.
# NOTE(review): this uses the weights as they are at the end of training; the
# best checkpoint saved to "w.h5" is not reloaded here — confirm this is intended.
pv = net.predict(xv, batch_size=50_000, verbose=1)
pe = net.predict(xe, batch_size=50_000, verbose=1)

In [None]:
# Loss on the validation split (which the training mask also covers).
net.evaluate(xv, yv, batch_size=50_000)

In [None]:
# Regroup the flat predictions into blocks of 28 consecutive rows x 9 quantiles;
# presumably one block per series, given the earlier sort by (id, nb) — verify.
pv = pv.reshape((-1, 28, 9))
pe = pe.reshape((-1, 28, 9))

In [None]:
# Same 28-per-row layout for the scales, so they broadcast against the predictions.
sv = sv.reshape((-1, 28))
se = se.reshape((-1, 28))

In [None]:
# Validation targets in the same 28-per-row layout.
Yv = yv.reshape((-1, 28))

In [None]:
# Plot the central quantile forecasts against the actual (scaled) values for
# one randomly chosen validation row. The upper bound comes from the data
# rather than the hard-coded 42840, so the cell also works on a subset.
k = np.random.randint(0, Yv.shape[0])
print(ids[k, 0])
horizon = np.arange(28, 56)
plt.plot(horizon, Yv[k], label="true")
plt.plot(horizon, pv[k, :, 3], label="q25")
plt.plot(horizon, pv[k, :, 4], label="q50")
plt.plot(horizon, pv[k, :, 5], label="q75")
plt.xlabel("Day")
plt.ylabel("Scaled demand")
plt.legend(loc="best")
plt.show()

In [None]:
# Forecast column names F1..F28, one per day of the horizon.
names = [f"F{i+1}" for i in range(28)]

In [None]:
# One id per row of the reshaped `ids` array (its first column), used as the key column.
piv = pd.DataFrame(ids[:, 0], columns=["id"])

In [None]:
# For every quantile, rescale the predictions back to unit sales and build one
# validation frame and one evaluation frame with suffixed ids.
QUANTILES = ["0.005", "0.025", "0.165", "0.250", "0.500", "0.750", "0.835", "0.975", "0.995"]
VALID = []
EVAL = []

for i, quantile in tqdm(enumerate(QUANTILES)):
    # Validation part: predictions multiplied by the validation scales.
    t1 = piv.join(pd.DataFrame(pv[:, :, i] * sv, columns=names))
    t1["id"] = t1["id"] + f"_{quantile}_validation"
    VALID.append(t1)
    # Evaluation part: predictions multiplied by the test scales.
    t2 = piv.join(pd.DataFrame(pe[:, :, i] * se, columns=names))
    t2["id"] = t2["id"] + f"_{quantile}_evaluation"
    EVAL.append(t2)

In [None]:
# Stack all validation frames followed by all evaluation frames.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0; as
# before, each frame keeps its own index.
sub = pd.concat(VALID + EVAL)
del VALID, EVAL, t1, t2

In [None]:
# Preview the assembled submission table.
sub.head()

In [None]:
# Write the submission file without the index column.
sub.to_csv("submission.csv", index=False)