In [None]:
import math
import os
import time

import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd
import seaborn as sns; sns.set(style="ticks", color_codes=True)
from IPython.display import display

from sklearn.feature_selection import RFE
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_absolute_error as MAE, mean_squared_error as MSE
from sklearn.model_selection import train_test_split, KFold

In [None]:
# Widen pandas' display limits so the wide merged frames can be inspected.
for option_name, option_value in {
    'display.max_rows': 50,
    'display.max_columns': 100,
    'display.width': 256,
}.items():
    pd.set_option(option_name, option_value)

# **Data Loading**

In [None]:
data_root = '../input/walmart-recruiting-store-sales-forecasting'

# The store table and the weekly feature table are the same for both splits,
# so they are read once instead of once per split.
stores = pd.read_csv(f"{data_root}/stores.csv", names=['Store', 'Type', 'Size'], sep=',', header=0)
features = pd.read_csv(
    f"{data_root}/features.csv.zip", sep=',', header=0,
    names=['Store', 'Date', 'Temperature', 'Fuel_Price',
           'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5',
           'CPI', 'Unemployment', 'IsHoliday'],
).drop(columns=['IsHoliday'])  # the sales files already provide an 'isHoliday' column

# Column names imposed on each sales file (the test file has no target column).
sales_columns = {
    'train': ['Store', 'Dept', 'Date', 'weeklySales', 'isHoliday'],
    'test': ['Store', 'Dept', 'Date', 'isHoliday'],
}

datasets = dict()
for ds in ['train', 'test']:
    dataset = pd.read_csv(f"{data_root}/{ds}.csv.zip", sep=',', header=0, names=sales_columns[ds])
    # Explicit keys document the joins; validate= fails loudly if a side table
    # ever contains duplicate keys, which would silently duplicate sales rows.
    dataset = (
        dataset
        .merge(stores, how='left', on='Store', validate='many_to_one')
        .merge(features, how='left', on=['Store', 'Date'], validate='many_to_one')
    )

    dataset['Date'] = pd.to_datetime(dataset['Date'])
    display(dataset.head())

    datasets[ds] = dataset

In [None]:
# Training rows whose weekly sales are zero or negative.
datasets['train'].query('weeklySales <= 0')

In [None]:
# Inspect the column dtypes of the merged training frame.
datasets['train'].dtypes

In [None]:
def describe_missing_values(df: pd.DataFrame):
    """Summarise the columns of ``df`` that contain missing values.

    Prints the total number of columns and how many of them have missing
    entries, then returns a table indexed by column name with the columns
    'Missing Values' (null count) and '% of Total Values' (null share of the
    rows, in percent). Only columns with a non-zero share are kept; rows are
    sorted by that share in descending order and rounded to one decimal.
    """
    null_mask = df.isnull()
    summary = pd.concat(
        [null_mask.sum(), 100 * null_mask.sum() / len(df)],
        axis=1,
        keys=['Missing Values', '% of Total Values'],
    )
    has_missing = summary['% of Total Values'] != 0
    summary = summary.loc[has_missing].sort_values('% of Total Values', ascending=False).round(1)

    print(f"Dataframe has {df.shape[1]} columns,")
    print(f"\t\t {summary.shape[0]} columns that have missing values.")

    return summary


def visualize_distribution_of_missing_values(df: pd.DataFrame):
    """Bar-plot the share of missing values in every column of ``df``.

    Columns are ordered by ascending null count; each bar shows the share of
    null rows in percent, truncated to an integer.
    """
    missing_counts = df.isna().sum().sort_values()
    missing_percent = {
        column: int(count / len(df) * 100)
        for column, count in missing_counts.items()
    }

    # Visualize
    plt.figure(figsize=(9, 6))
    plt.suptitle('Distribution of Empty Values', fontsize=19)
    plt.bar(missing_percent.keys(), missing_percent.values())
    plt.ylabel('% missing')
    plt.xticks(rotation=69)
    plt.show()
    

# Report the missing-value summary for each split.
for split_name in ('train', 'test'):
    print(f'\n\n{split_name}-set:')
    missing_summary = describe_missing_values(datasets[split_name])
    print(missing_summary)

# **Data Exploration**

In [None]:
def scatter(dataset, column):
    """Scatter ``dataset[column]`` (x) against ``dataset['weeklySales']`` (y) on a new figure."""
    _, ax = plt.subplots()
    ax.scatter(dataset[column], dataset['weeklySales'], alpha=0.169)
    ax.set_xlabel(column)
    ax.set_ylabel('weeklySales')

In [None]:
# One sales-vs-feature scatter plot per explored column of the training set.
explored_columns = (
    'Fuel_Price', 'Size', 'CPI', 'Type', 'isHoliday',
    'Unemployment', 'Temperature', 'Store', 'Dept',
)
for col in explored_columns:
    scatter(datasets['train'], col)

In [None]:
# Heatmap of pairwise correlations between the training columns.
fig = plt.figure(figsize=(18, 14))
# Correlation is only defined for numeric data. Select numeric/boolean columns
# explicitly: recent pandas versions raise when DataFrame.corr() meets
# non-numeric columns (here the datetime 'Date' and the categorical 'Type'),
# whereas older versions dropped them silently.
numeric_train = datasets['train'].select_dtypes(include=['number', 'bool'])
corr = numeric_train.corr()
c = plt.pcolor(corr)
# Ticks sit at the centre of each cell, hence the 0.5 offset.
plt.yticks(np.arange(0.5, len(corr.index), 1), corr.index)
plt.xticks(np.arange(0.5, len(corr.columns), 1), corr.columns, rotation=45)
fig.colorbar(c)

# **Data Manipulation**

In [None]:
# A flagged holiday week is labelled purely by the calendar month of its date.
HOLIDAY_BY_MONTH = {2: 'Super_Bowl', 9: 'Labor_Day', 11: 'Thanksgiving', 12: 'Christmas'}

for ds in datasets.keys():
    frame = datasets[ds]

    # make holidays more specific: non-holiday weeks (and holiday weeks in an
    # unmapped month) get no label and therefore no one-hot column set below
    is_holiday = frame['isHoliday'] == True
    holiday_type = frame['Date'].dt.month.map(HOLIDAY_BY_MONTH).where(is_holiday)
    frame = frame.assign(Holiday_Type=holiday_type).drop(columns=['isHoliday'])

    # 1-hot encoding for categorical features
    frame = pd.get_dummies(frame, columns=["Type", "Holiday_Type"])

    # data imputation: every remaining missing value becomes 0
    frame = frame.fillna(value=0)

    datasets[ds] = frame
    display(frame.head())

In [None]:
# Give the test set every non-key column the training set has (e.g. one-hot
# columns for categories that never occur in the test period), filled with 0.
key_columns = ('Store', 'Dept', 'Date')
missing_in_test = [
    col for col in datasets['train'].columns
    if col not in key_columns and col not in datasets['test'].columns
]
for col in missing_in_test:
    datasets['test'][col] = 0

datasets['train'] = datasets['train'].rename(columns={'weeklySales': 'Weekly_Sales'})
# Floor negative sales at zero. The assignment goes through .loc on the frame
# itself: chained indexing (df[col][mask] = value) may write to a temporary
# copy and does nothing under pandas copy-on-write.
datasets['train'].loc[datasets['train']['Weekly_Sales'] < 0, 'Weekly_Sales'] = 0

# Same feature order in both splits, with the target as the last training column
# (later cells rely on the target being last).
feature_names = [col for col in datasets['train'] if col != 'Weekly_Sales']
datasets['train'] = datasets['train'][feature_names+['Weekly_Sales']]
datasets['test'] = datasets['test'][feature_names]

display(datasets['train'].head())
display(datasets['test'].head())

In [None]:
# Count the rows of every (Store, Dept) series, then count how many series
# share each row count.
rows_per_series = datasets['train'].groupby(["Store", "Dept"])['Date'].agg(['count'])
stats = rows_per_series.value_counts(sort=False).to_frame('#(store, dept)')
display(stats)
# After dropping the index, the x axis is the positional rank of each row
# count, not the row count itself.
stats = stats.reset_index(drop=True)
plt.plot(stats.index, stats.values)

In [None]:
# Work on copies so the frames stored in `datasets` stay untouched.
data_train = datasets['train'].copy()
data_test = datasets['test'].copy()

In [None]:
window_size = 12
stride = 1
X_all, y_all = [], []
for name, group in data_train.groupby(["Store", "Dept"]):
    data_group = group.sort_values(by=['Date'], ascending=True)
    # Force a float matrix: when the one-hot columns are boolean, a plain
    # to_numpy() on the mixed-dtype frame yields an object array.
    data_group = data_group.drop(columns=["Store", "Dept", "Date"]).to_numpy(dtype=np.float64)
    if data_group.shape[0] < window_size/3:
        continue

    # Padding: prepend zero rows so that (length - window_size) is a multiple
    # of the stride. Integer arithmetic replaces a float computation that
    # needed a tolerance check; with stride == 1 no padding is ever required.
    remainder = (len(data_group) - window_size) % stride
    if remainder:
        n_pads = stride - remainder
        data_padded = np.zeros(shape=(len(data_group)+n_pads, data_group.shape[1]))
        data_padded[-data_group.shape[0]:, :] = data_group
        data_group = data_padded

    # The target is the last column (the column order is fixed in an earlier cell).
    X_group = data_group[:, :-1]
    y_group = data_group[:, -1]

    # Each sample holds the features of `window_size` consecutive weeks; its
    # target is the sales value of the week right after the window.
    # Series no longer than the window produce no samples (empty range).
    n_samples = (len(data_group) - window_size) // stride
    for s in range(n_samples):
        s_start = s * stride
        X_sample = X_group[s_start:s_start+window_size, :]
        y_sample = y_group[s_start+window_size]
        X_all.append(X_sample.T)  # -> (n_features, window_size)
        y_all.append(y_sample)

X_all, y_all = np.array(X_all), np.array(y_all)
print('Total samples:\t\t', X_all.shape, y_all.shape)

# Cap the number of samples handed to the classical ML models.
# NOTE(review): y_ml is taken here, before the log transform applied to y_all
# in a later cell, so y_ml stays on the raw sales scale — confirm this is intended.
ML_max_samples = 70_000
if len(X_all) > ML_max_samples:
    X_ml, _, y_ml, _ = train_test_split(X_all, y_all, train_size=ML_max_samples, random_state=20_03_21, shuffle=True)
else:
    X_ml, y_ml = X_all, y_all
print('Samples for ML model:\t', X_ml.shape, y_ml.shape)

In [None]:
# Summary statistics and a scatter (in sample order) of the regression targets.
sales_stats = pd.DataFrame(y_all)
target_summary = sales_stats.describe()
display(target_summary)
plt.scatter(x=sales_stats.index, y=sales_stats.values, alpha=0.169)

In [None]:
# Same inspection on the log scale; targets below 1 are raised to 1 first so
# the logarithm is never negative or undefined.
log_targets = np.log(np.clip(y_all, 1, None))
sales_stats = pd.DataFrame(log_targets)
display(sales_stats.describe())
plt.scatter(x=sales_stats.index, y=sales_stats.values, alpha=0.169)

In [None]:
# Replace the targets by their natural log, flooring values below 1 at 1 first.
# Re-running this cell applies the transform again, so run it exactly once.
y_all = np.log(np.clip(y_all, 1, None))

In [None]:
# A missing target marks the rows that have to be predicted.
data_test['Weekly_Sales'] = np.nan
data_group_extended = dict()
N_features = len(datasets['test'].columns) - 3  # every column except Store, Dept, Date

# Split the training history by (Store, Dept) once, instead of masking the
# whole training frame for every test group.
train_groups = {key: grp for key, grp in data_train.groupby(["Store", "Dept"])}
empty_train = data_train.iloc[0:0]

for name, test_group in data_test.groupby(["Store", "Dept"]):

    train_group = train_groups.get(name, empty_train)

    ######################################################
    # Concatenate train-set & test-set per (store, dept) #
    ######################################################
    data_group = pd.concat([train_group, test_group]) if len(train_group) > 0 else test_group
    # Non-inplace operations: test_group is a slice handed out by groupby.
    data_group = data_group.sort_values(by=['Date'], ascending=True).reset_index(drop=True)

    ######################################
    # Build batch samples for prediction #
    ######################################
    to_predict = data_group.Weekly_Sales.isna()
    predict_indices = list(data_group.index[to_predict])
    if len(predict_indices) == 0:
        continue
    # Columns 3..-1 are the features (Store, Dept, Date come first and the
    # target last). Cast to float so no object array is produced.
    features = data_group.iloc[:, 3:-1].to_numpy(dtype=np.float64)

    # Padding: with fewer than `window_size` rows of history, prepend zero rows
    # so that the first predicted row still has a full window before it.
    if len(train_group) < window_size:
        n_pads = window_size - len(train_group)
        features = np.vstack([np.zeros(shape=(n_pads, N_features)), features])
        predict_indices = [i+n_pads for i in predict_indices]

    # Sample by batch: one (N_features, window_size) window per predicted row,
    # made of the `window_size` rows that precede it.
    X_predict = np.empty(shape=(len(predict_indices), N_features, window_size))
    for j, idx in enumerate(predict_indices):
        X_predict[j, ...] = features[idx-window_size:idx, :].T

    # .copy() gives an independent frame that can be written to later on.
    data_group_extended[name] = [data_group.loc[to_predict, ["Store", "Dept", "Date", "Weekly_Sales"]].copy(),
                                 predict_indices,
                                 X_predict]

In [None]:
# Check size of predictions per (store, dept): the number of rows awaiting a
# prediction, the number of indices and the number of windows must all agree.
for name, (data_group, indices, X_group) in data_group_extended.items():
    sizes = (data_group.Weekly_Sales.isna().sum(), len(indices), X_group.shape[0])
    if len(set(sizes)) > 1:
        print(name, sizes[0], sizes[1], X_group.shape)

# **Modeling**

## **Input Shape**: (N_samples, N_features, Max_seq_len)

In [None]:
!pip install --ignore-installed tsai

In [None]:
from tsai.all import *

import torch

def torch2np(tensor: torch.Tensor) -> np.ndarray:
    """Convert a torch tensor to a NumPy array.

    The tensor is detached from the autograd graph and moved to host memory
    first, so tensors that require grad or live on an accelerator are accepted
    as well. Both steps are no-ops for a plain CPU tensor.
    """
    return tensor.detach().cpu().numpy()

In [None]:
# scikit-learn scorer built from MSE, declared as a metric where lower is better.
scorer = make_scorer(score_func=MSE, greater_is_better=False)

## MiniRocket

In [None]:
# Machine-Learning models: (estimator class, constructor kwargs) pairs.
_minirocket_defaults = {'num_features': 10_000, 'max_dilations_per_kernel': 32}
ML_models = [
    # (RocketClassifier, {'num_kernels': 10_000}),
    (MiniRocketClassifier, dict(_minirocket_defaults)),
    (MiniRocketVotingClassifier, dict(_minirocket_defaults, n_estimators=3)),
]

In [None]:
# Train/validation split for the ML models: a 69% share while the pool is
# below the cap, otherwise an absolute count of half the cap.
if len(X_ml) < ML_max_samples:
    ml_train_size = 0.69
else:
    ml_train_size = ML_max_samples // 2

X_train, X_val, y_train, y_val = train_test_split(
    X_ml, y_ml,
    train_size=ml_train_size,
    random_state=4_10_20,
    shuffle=True,
)
for features_part, target_part in ((X_train, y_train), (X_val, y_val)):
    print(features_part.shape, target_part.shape)

In [None]:
# model = MiniRocketRegressor(num_features=10_000, 
#                             max_dilations_per_kernel=window_size,
#                             normalize_features=False,
#                             verbose=True,
#                             scoring=scorer)

# print("Training MiniRocket ...")
# timer.start(False)
# model.fit(X_train, y_train)
# t = timer.stop()
# print(f"\t ... in {t}")

In [None]:
# y_pred = model.predict(X_val)
# error = MSE(y_val, y_pred, squared=False) # Root-MSE
# print(f'Val Error: {error:.5f}')

In [None]:
# results = []
# for name, (data_group, indices, X_test) in data_group_extended.items():
#     if len(data_group) != X_test.shape[0]:
#         raise ValueError(f"{name} - {len(data_group)} != {X_test.shape[0]}")
#     else:
#         y_pred = model.predict(X_test)
#         data_group.Weekly_Sales = y_pred
#         data_group['Date'] = pd.to_datetime(data_group.Date, format='%Y-%m-%d %H:%M:%S')
#         data_group['Id'] = data_group['Store'].astype(int).apply(str) + '_' \
#                           + data_group['Dept'].astype(int).apply(str) + '_' \
#                           + data_group['Date'].dt.strftime('%Y-%m-%d')
#         results.append(data_group[['Id', 'Weekly_Sales']])
        
# results = pd.concat(results)
# results.Weekly_Sales = results.Weekly_Sales.apply(np.exp)
# results.to_csv('submission_MiniRocket.csv', index=False)

In [None]:
# if not os.path.isdir('models'):
#     os.mkdir('models')
# model.save('MiniRocket')

## Deep Learning

In [None]:
# Deep-Learning models: name -> (architecture class, constructor kwargs).
_inception_kwargs = {'nf': 32, 'ks': window_size}
_transformer_kwargs = {'max_seq_len': window_size*2, 'd_model': 64, 'd_ff': 32, 'n_layers': 2, 'n_heads': 8}
DL_models = {
    "InceptionTime": (InceptionTime, dict(_inception_kwargs)),
    "InceptionTimePlus": (
        InceptionTimePlus,
        dict(_inception_kwargs, bottleneck=True, depth=4, dilation=1, stride=1),
    ),
    "TSTransformer": (TST, dict(_transformer_kwargs)),
    "TSTransformerPlus": (TSTPlus, dict(_transformer_kwargs)),
}

In [None]:
# 69/31 train/validation split over all windows, then merged back into a single
# array pair plus split indices.
dl_split = train_test_split(X_all, y_all, train_size=0.69, random_state=4_10_20, shuffle=True)
X_train, X_val, y_train, y_val = dl_split
X_dl, y_dl, splits = combine_split_data([X_train, X_val], [y_train, y_val])

In [None]:
# Item transforms: none for the inputs, TSRegression for the targets.
transformations = [None, [TSRegression()]]
# Batch transform: standardisation with by_sample=True and by_var=True.
batch_transformations = [TSStandardize(by_sample=True, by_var=True)]

dsets = TSDatasets(
    X_dl, y_dl,
    splits=splits,
    tfms=transformations,
    inplace=True,
)
dloaders = TSDataLoaders.from_dsets(
    dsets.train, dsets.valid,
    bs=[64, 32],
    batch_tfms=batch_transformations,
    num_workers=0,
)

In [None]:
# Pick an architecture by name, instantiate it for the dataloaders and wrap it
# in a Learner that reports MAE and RMSE.
MODEL_NAME = 'InceptionTime'
model_cls, model_params = DL_models[MODEL_NAME]
model = create_model(model_cls, dls=dloaders, **model_params)
learner = Learner(
    dls=dloaders,
    model=model,
    metrics=[mae, rmse],
    opt_func=Adam,
)

In [None]:
# Learning-rate range test.
# NOTE(review): the unpacking assumes lr_find returns exactly two suggestions —
# confirm for the installed fastai version.
lr_suggestions = learner.lr_find(start_lr=1e-7, end_lr=1e0, num_it=1_000)
lr_min, lr_steepest = lr_suggestions
lr_min, lr_steepest

In [None]:
# Train for 150 epochs with the one-cycle schedule, peaking at the suggested lr_min.
learner.fit_one_cycle(150, lr_max=lr_min)

In [None]:
# valid_probas, valid_targets, valid_preds = learner.get_preds(dl=dloaders.valid, with_decoded=True)
# acc = torch2np((valid_targets==valid_preds).float().mean())

In [None]:
# Display the bound `parameters` method; it is not called, so no parameter
# iterator is created here.
model.parameters

In [None]:
# Collect the rows to predict and their input windows for every (store, dept).
group_frames, group_inputs = [], []
for name, (data_group, indices, X_test) in data_group_extended.items():
    if len(data_group) != X_test.shape[0]:
        raise ValueError(f"{name} - {len(data_group)} != {X_test.shape[0]}")
    group_frames.append(data_group)
    group_inputs.append(X_test)

submission_rows = pd.concat(group_frames)
X_test_all = np.concatenate(group_inputs).astype(np.float32)

# Predict through the Learner rather than by calling the model directly, so the
# batch transforms configured on the dataloaders (the standardisation used
# during training) are applied to the test windows and inference runs without
# gradient tracking. A single call covers all groups.
raw_preds, *_ = learner.get_X_preds(X_test_all)
y_pred_log = torch2np(raw_preds.detach()).reshape(-1)
if len(y_pred_log) != len(submission_rows):
    raise ValueError(f"{len(y_pred_log)} predictions for {len(submission_rows)} rows")

# Submission id: <Store>_<Dept>_<YYYY-MM-DD>
submission_ids = (
    submission_rows['Store'].astype(int).apply(str) + '_'
    + submission_rows['Dept'].astype(int).apply(str) + '_'
    + pd.to_datetime(submission_rows['Date']).dt.strftime('%Y-%m-%d')
)

results = pd.DataFrame({
    'Id': submission_ids.to_numpy(),
    # The model was fitted on log targets, so map predictions back to sales.
    'Weekly_Sales': np.exp(y_pred_log),
})
results.to_csv(f'submission_{MODEL_NAME}.csv', index=False)