# Setup

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk '/kaggle/input' recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the directory being visited with the file name to get a usable path.
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Metrics

In [None]:
# https://gist.github.com/bshishov/5dc237f59f019b26145648e2124ca1c9
def _error(actual: np.ndarray, predicted: np.ndarray):
    """ Simple error """
    return actual - predicted

def _naive_forecasting(actual: np.ndarray, seasonality: int = 1):
    """ Naive forecasting method which just repeats previous samples """
    return actual[:-seasonality]

def mse(actual: np.ndarray, predicted: np.ndarray):
    """Mean Squared Error: the mean of the squared residuals over all elements."""
    residual = _error(actual, predicted)
    squared = np.square(residual)
    return np.mean(squared)

def mae(actual: np.ndarray, predicted: np.ndarray):
    """Mean Absolute Error: the mean of the absolute residuals over all elements."""
    residual = _error(actual, predicted)
    magnitude = np.abs(residual)
    return np.mean(magnitude)

# https://gist.github.com/bshishov/5dc237f59f019b26145648e2124ca1c9#gistcomment-3593885
def rmsse(actual: np.ndarray, predicted: np.ndarray, seasonality: int = 1):
    """Root Mean Squared Scaled Error.

    Computes ``sqrt(MSE(actual, predicted) / MSE(actual[seasonality:], naive))``
    where ``naive`` is ``actual`` shifted by ``seasonality`` samples.

    This notebook used to define ``rmsse`` twice in the same cell; the earlier
    definition (absolute error scaled by the naive MAE) was silently shadowed
    by this one, so only this formula was ever in effect. The dead duplicate
    has been removed.

    Args:
        actual: Observed values.
        predicted: Forecast values, same shape as ``actual``.
        seasonality: Lag used for the naive reference forecast.

    Returns:
        The scalar RMSSE.

    Note:
        The naive reference is built by slicing ``actual`` along its first
        axis, so for 2-D input the first axis is the one treated as time.
    """
    naive_mse = mse(actual[seasonality:], _naive_forecasting(actual, seasonality))
    q = mse(actual, predicted) / naive_mse
    return np.sqrt(q)

# https://www.kaggle.com/chrisrichardmiles/m5-wrmsse-custom-objective-and-custom-metric
# oos_scale = 1/w_12_train.oos_level_12_scale
def oos_rmsse(preds, train_data):
    """Custom objective returning a (gradient, hessian) pair.

    The gradient is the negated residual ``label - preds`` multiplied by the
    module-level ``oos_scale``; the hessian is an array of ones with the
    residual's shape.

    NOTE(review): ``oos_scale`` is read as a global and is only mentioned in a
    comment in this notebook -- it must be defined before this is called.
    ``train_data`` only needs to expose ``get_label()``.
    """
    residual = train_data.get_label() - preds
    gradient = -residual * oos_scale
    hessian = np.ones_like(residual)
    return gradient, hessian
    
# The scaling factor, w_12_train.oos_level_12_scale, 
# is the same as described in the competition, except 
# I removed zeros that I believed were due to being 
# out of stock before calculation.

# https://stackoverflow.com/questions/65216794/importerror-when-importing-metric-from-sklearn
# ! conda install sklearn
# ! pip install scikit-learn==0.24
from sklearn.metrics import mean_squared_error #, mean_absolute_percentage_error
from sklearn.utils import check_array
def mean_absolute_percentage_error(y_true, y_pred):
    """Mean Absolute Percentage Error, expressed in percent.

    The previous version called ``check_array(y_true, y_pred)``, which treats
    ``y_pred`` as the ``accept_sparse`` argument and returns a single array,
    so unpacking it into two names was wrong. Both inputs are now converted
    independently with NumPy.

    Args:
        y_true: Array-like of observed values.
        y_pred: Array-like of predicted values, broadcastable to ``y_true``.

    Returns:
        ``mean(|y_true - y_pred| / |y_true|) * 100`` as a float.

    Note:
        Entries where ``y_true`` is zero produce ``inf``/``nan`` terms, as in
        any plain MAPE; no special handling is applied here.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    ## Note: does not handle mix 1d representation
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100



#Defining MAPE function
def MAPE(Y_actual,Y_Predicted):
    """Mean absolute percentage error (in percent) of the predictions."""
    relative_error = (Y_actual - Y_Predicted) / Y_actual
    return np.mean(np.abs(relative_error)) * 100

# https://www.kaggle.com/chameleontk/rmse-and-wrmsse-of-a-submission
def wrmsse(preds, y_true):
    """Weighted RMSSE score of flat predictions against flat targets.

    Both inputs are reshaped to ``(DAYS_PRED, NUM_ITEMS)`` and transposed,
    stacked side by side, and multiplied by ``weight_mat_csr``. The mean
    squared difference per resulting row is divided by ``weight1``,
    square-rooted, multiplied by ``weight2`` and summed.

    NOTE(review): ``DAYS_PRED``, ``NUM_ITEMS``, ``weight_mat_csr``, ``weight1``
    and ``weight2`` are globals that must exist before this is called; they
    are not defined in the visible part of this notebook.
    """
    n_days = DAYS_PRED

    preds_by_item = preds.reshape(n_days, NUM_ITEMS).T
    truth_by_item = y_true.reshape(n_days, NUM_ITEMS).T

    # Predictions occupy the first n_days columns, ground truth the rest.
    aggregated = weight_mat_csr*np.c_[preds_by_item, truth_by_item]

    residual = aggregated[:,:n_days] - aggregated[:,n_days:]
    mean_sq_error = np.mean(np.square(residual), axis=1)
    return np.sum(np.sqrt(mean_sq_error / weight1) * weight2)

# TODO: See if below is better?
# https://www.kaggle.com/sibmike/fast-clear-wrmsse-18ms?scriptVersionId=32323454



## Own local wrmsse evaluation/function

In [None]:
# https://www.kaggle.com/c/m5-forecasting-accuracy/discussion/143070

# https://www.kaggle.com/dhananjay3/wrmsse-evaluator-with-extra-features
from typing import Union

import numpy as np
import pandas as pd
from tqdm.auto import tqdm as tqdm

class WRMSSEEvaluator(object):
    """Weighted RMSSE evaluator over every aggregation level in ``group_ids``.

    On construction the per-row series are summed up to each grouping in
    ``group_ids``; a weight (sales value over the last 28 training days) and a
    scale (mean squared one-step difference of the training series) are
    computed for each aggregated series. ``score`` then returns the weighted
    sum of the per-series RMSSE values for a set of predictions.
    """

    group_ids = ( 'all_id', 'state_id', 'store_id', 'cat_id', 'dept_id', 'item_id',
        ['state_id', 'cat_id'],  ['state_id', 'dept_id'], ['store_id', 'cat_id'],
        ['store_id', 'dept_id'], ['item_id', 'state_id'], ['item_id', 'store_id'])

    def __init__(self,
                 train_df: pd.DataFrame,
                 valid_df: pd.DataFrame,
                 calendar: pd.DataFrame,
                 prices: pd.DataFrame):
        '''
        Initialize the evaluator and pre-compute weights and scales.

        Args:
            train_df: Wide training frame with id columns and ``d_*`` day columns.
            valid_df: Wide frame holding the ``d_*`` columns to score against.
            calendar: Frame with at least the columns ``d`` and ``wm_yr_wk``.
            prices: Frame with ``item_id``, ``store_id``, ``wm_yr_wk`` and
                ``sell_price``.
        '''
        self.calendar = calendar
        self.prices = prices
        # `assign` returns a new frame, so the caller's DataFrame is not mutated.
        # Previously `all_id` was written into the caller's frame; building a
        # second evaluator from the same frame then picked `all_id` up as one of
        # the "last 28 columns" used for the weights.
        self.train_df = train_df.assign(all_id="all")
        self.valid_df = valid_df
        self.train_target_columns = [i for i in self.train_df.columns if i.startswith('d_')]
        # Weight window: the last 28 day columns, selected by name, not position.
        self.weight_columns = self.train_target_columns[-28:]

        self.id_columns = [i for i in self.train_df.columns if not i.startswith('d_')]
        self.valid_target_columns = [i for i in self.valid_df.columns if i.startswith('d_')]

        # Attach the id columns to the validation frame when they are missing.
        if not all([c in self.valid_df.columns for c in self.id_columns]):
            self.valid_df = pd.concat([self.train_df[self.id_columns], self.valid_df],
                                      axis=1,
                                      sort=False)
        self.train_series = self.trans_30490_to_42840(self.train_df,
                                                      self.train_target_columns,
                                                      self.group_ids)
        self.valid_series = self.trans_30490_to_42840(self.valid_df,
                                                      self.valid_target_columns,
                                                      self.group_ids)
        self.weights = self.get_weight_df()
        self.scale = self.get_scale()
        # Drop references that are no longer needed once weights/scales exist.
        self.train_series = None
        self.train_df = None
        self.prices = None
        self.calendar = None

    def get_scale(self):
        '''
        Scaling factor for each aggregated training series.

        Leading zeros are trimmed, then the mean squared difference between
        consecutive values is taken.
        '''
        scales = []
        for i in tqdm(range(len(self.train_series))):
            series = self.train_series.iloc[i].values
            # argmax of the boolean mask is the position of the first non-zero value.
            series = series[np.argmax(series!=0):]
            scale = ((series[1:] - series[:-1]) ** 2).mean()
            scales.append(scale)
        return np.array(scales)

    def get_name(self, i):
        '''
        Convert a group key to the unique string used to name a series.

        A list/tuple key (multi-column grouping) is joined with ``--``; any
        other key is converted with ``str``. Members of a list/tuple are
        stringified first, so non-string members no longer raise.
        '''
        if isinstance(i, (list, tuple)):
            return "--".join(str(part) for part in i)
        return str(i)

    def get_weight_df(self) -> pd.DataFrame:
        """
        Return the weight of every aggregated series as a one-column DataFrame.

        Units sold on the weight-window days are multiplied by the sell price
        of the matching week, summed per group, normalised within each
        aggregation level and divided by the number of levels.
        """
        day_to_week = self.calendar.set_index("d")["wm_yr_wk"].to_dict()
        weight_df = self.train_df[["item_id", "store_id"] + self.weight_columns].set_index(
            ["item_id", "store_id"]
        )
        weight_df = (
            weight_df.stack().reset_index().rename(columns={"level_2": "d", 0: "value"})
        )
        weight_df["wm_yr_wk"] = weight_df["d"].map(day_to_week)
        weight_df = weight_df.merge(
            self.prices, how="left", on=["item_id", "store_id", "wm_yr_wk"]
        )
        weight_df["value"] = weight_df["value"] * weight_df["sell_price"]
        weight_df = weight_df.set_index(["item_id", "store_id", "d"]).unstack(level=2)[
            "value"
        ]
        # Restore the row order of train_df (unstack sorts the index).
        weight_df = weight_df.loc[
            list(zip(self.train_df.item_id, self.train_df.store_id)), :
        ].reset_index(drop=True)
        weight_df = pd.concat(
            [self.train_df[self.id_columns], weight_df], axis=1, sort=False
        )
        weights_map = {}
        for group_id in tqdm(self.group_ids, leave=False):
            lv_weight = weight_df.groupby(group_id)[self.weight_columns].sum().sum(axis=1)
            lv_weight = lv_weight / lv_weight.sum()
            for pos in range(len(lv_weight)):
                weights_map[self.get_name(lv_weight.index[pos])] = np.array(
                    [lv_weight.iloc[pos]]
                )
        weights = pd.DataFrame(weights_map).T / len(self.group_ids)

        return weights

    def trans_30490_to_42840(self, df, cols, group_ids, dis=False):
        '''
        Transform the 30490 row-level series into all 42840 aggregated series.

        Args:
            df: Wide frame containing the grouping columns and ``cols``.
            cols: Day columns to sum within each group.
            group_ids: Groupings to aggregate over. This argument used to be
                ignored in favour of ``self.group_ids``; it is now honoured
                (every call in this class passes ``self.group_ids``).
            dis: When true, the progress bar is disabled.

        Returns:
            DataFrame with one row per aggregated series, indexed by name.
        '''
        series_map = {}
        for group_id in tqdm(group_ids, leave=False, disable=dis):
            tr = df.groupby(group_id)[cols].sum()
            for pos in range(len(tr)):
                series_map[self.get_name(tr.index[pos])] = tr.iloc[pos].values
        return pd.DataFrame(series_map).T

    def get_rmsse(self, valid_preds) -> pd.Series:
        '''
        Return the RMSSE of every aggregated series.

        ``valid_preds`` must already be aggregated the same way as
        ``self.valid_series``.
        '''
        score = ((self.valid_series - valid_preds) ** 2).mean(axis=1)
        rmsse = (score / self.scale).map(np.sqrt)
        return rmsse

    def score(self, valid_preds: Union[pd.DataFrame, np.ndarray]) -> float:
        '''
        Return the weighted RMSSE of ``valid_preds``.

        Args:
            valid_preds: Row-level predictions with the same shape as the
                validation target columns. An ndarray is wrapped in a
                DataFrame using those column names.

        Side effects:
            Stores the per-series RMSSE in ``self.rmsse`` and the weighted
            per-series contributions in ``self.contributors``.
        '''
        assert self.valid_df[self.valid_target_columns].shape == valid_preds.shape

        if isinstance(valid_preds, np.ndarray):
            valid_preds = pd.DataFrame(valid_preds, columns=self.valid_target_columns)

        valid_preds = pd.concat([self.valid_df[self.id_columns], valid_preds],
                                axis=1,
                                sort=False)
        valid_preds = self.trans_30490_to_42840(valid_preds,
                                                self.valid_target_columns,
                                                self.group_ids,
                                                True)
        self.rmsse = self.get_rmsse(valid_preds)
        # weight * rmsse per series; their sum is the final score.
        self.contributors = pd.concat([self.weights, self.rmsse],
                                      axis=1,
                                      sort=False).prod(axis=1)
        return np.sum(self.contributors)

# Kaggle API

In [None]:
# https://github.com/Kaggle/kaggle-api
# Install the Kaggle API token. `-p` keeps mkdir from failing when the
# directory already exists (e.g. when this cell is run a second time).
! mkdir -p ~/.kaggle
! cp /kaggle/input/api-key/kaggle.json ~/.kaggle/
# Restrict the token file to owner read/write.
! chmod 600 ~/.kaggle/kaggle.json

# Functions

## Data functions

In [None]:
# https://www.kaggle.com/c/m5-forecasting-accuracy/discussion/133582
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` in place to the smallest fitting dtype.

    Integer columns are tried against int8/int16/int32/int64 and float columns
    against float16/float32 (falling back to float64). A dtype is chosen only
    when the column's min and max lie strictly inside its range.

    Args:
        df: DataFrame to shrink; its columns are reassigned in place.
        verbose: When true, print the resulting memory usage and reduction.

    Returns:
        The same DataFrame object.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # First integer width whose open range contains the column wins;
            # if none does, the column is left as it is.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Neither float16 nor float32 fits.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

def get_sample(wide_df, n: int = 100, random_state=None):
    """Return ``n`` randomly sampled rows of ``wide_df`` and print the sampled share.

    Args:
        wide_df: DataFrame to sample from.
        n: Number of rows to draw.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            draw is reproducible. The default ``None`` keeps the previous
            unseeded behaviour.

    Returns:
        The sampled DataFrame.
    """
    sample_df = wide_df.sample(n, random_state=random_state)
    print("sampling {} out of {} ({} %)".format(sample_df.shape[0], wide_df.shape[0], round(sample_df.shape[0]/wide_df.shape[0]*100,1)))
    return sample_df

def melt_m5(wide_df: pd.DataFrame,
            id_cols: list = None, d_cols: list = None,
            var_name: str = 'd', value_name: str = 'num_sales'):
    """Melt a wide sales frame (one column per day) into long format.

    ``var_name`` and ``value_name`` were previously accepted but ignored: the
    output columns were always called ``'d'`` and ``'num_sales'``. They are
    now honoured, and the ``var_name`` default is ``'d'`` so that calls relying
    on the defaults produce exactly the same columns as before.

    Args:
        wide_df: Wide frame with id columns and ``d_*`` day columns.
        id_cols: Identifier columns to keep; defaults to the six M5 id columns.
        d_cols: Day columns to unpivot; defaults to every column whose name
            starts with ``'d_'``.
        var_name: Name of the output column holding the day label.
        value_name: Name of the output column holding the value.

    Returns:
        The melted (long) DataFrame.
    """
    if id_cols is None:
        print('Using default id_cols')
        id_cols = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
    if d_cols is None:
        print('Using default d_cols')
        d_cols = [d_col for d_col in wide_df.columns if d_col.startswith('d_')]

    df_melted = pd.melt(wide_df, id_vars=id_cols, value_vars=d_cols,
                        var_name=var_name, value_name=value_name)

    return df_melted

def join_on_calendar_price(df, df_calendar, df_prices):
    """Left-join calendar information and sell prices onto a long sales frame.

    The price merge used to be keyed on ``'d'``. The price table is keyed by
    item, store and week (the same keys the WRMSSE evaluator in this notebook
    merges prices on), so the join now uses
    ``['store_id', 'item_id', 'wm_yr_wk']``; ``wm_yr_wk`` comes from the
    calendar merge. If the price frame does carry a ``'d'`` column, the
    original ``'d'`` key is still used.

    Args:
        df: Long sales frame with ``d``, ``item_id`` and ``store_id`` columns.
        df_calendar: Calendar frame with ``d`` and ``wm_yr_wk`` columns.
        df_prices: Price frame with ``store_id``, ``item_id``, ``wm_yr_wk``
            and the price column(s).

    Returns:
        ``df`` with calendar and price columns attached (left joins, so every
        row of ``df`` is kept).
    """
    df_calendar = pd.merge(df, df_calendar, how='left', on='d')
    price_keys = 'd' if 'd' in df_prices.columns else ['store_id', 'item_id', 'wm_yr_wk']
    # This merge is memory hungry on the full data set.
    df_calendar_price = pd.merge(df_calendar, df_prices, how='left', on=price_keys)
    return df_calendar_price

# https://datascience.stackexchange.com/questions/45550/merge-2-dataframe-with-memory-error

# Read data

We are predicting 28 forecast days (F1-F28) of items sold for each row. For the validation rows, this corresponds to d_1914 - d_1941, 
and for the evaluation rows, this corresponds to d_1942 - d_1969. (Note: a month before the competition close, the ground truth for the validation rows will be provided.)

sales_train_validation.csv - Contains the historical daily unit sales data per product and store [d_1 - d_1913]
sales_train_evaluation.csv - Includes sales [d_1 - d_1941]
 
Train [d_1 - d_1913] / validation [d_1914 - d_1941] (public?) | evaluation [d_1942 - d_1969] (private?)


In [None]:
# Validation window d_1914 .. d_1941 (28 days).
validation_column_names = ['d_' + str(day) for day in range(1914, 1942)]
# Evaluation window d_1942 .. d_1969 (28 days). The previous range(1942, 1969) with
# i+1 produced d_1943 .. d_1969, i.e. only 27 columns and without d_1942.
evaluation_column_names = ['d_' + str(day) for day in range(1942, 1970)]
# Column names used in the submission file: F1 .. F28.
forecast_column_names = ['F' + str(i + 1) for i in range(28)]

In [None]:
# Each CSV is read and immediately passed through reduce_mem_usage, which downcasts its numeric columns.
# Contains information about the dates on which the products are sold.
df_calendar = reduce_mem_usage(pd.read_csv('/kaggle/input/m5-forecasting-accuracy/calendar.csv'))
# The correct format for submissions. Reference the Evaluation tab for more info.
df_sample_submission = reduce_mem_usage(pd.read_csv('/kaggle/input/m5-forecasting-accuracy/sample_submission.csv'))
# Contains information about the price of the products sold per store and date.
df_sell_prices = reduce_mem_usage(pd.read_csv('/kaggle/input/m5-forecasting-accuracy/sell_prices.csv'))
# Contains the historical daily unit sales data per product and store [d_1 - d_1913]
df_sales_train_validation = reduce_mem_usage(pd.read_csv('/kaggle/input/m5-forecasting-accuracy/sales_train_validation.csv'))
# Includes sales [d_1 - d_1941] (labels used for the Public leaderboard)
df_sales_train_evaluation = reduce_mem_usage(pd.read_csv('/kaggle/input/m5-forecasting-accuracy/sales_train_evaluation.csv'))

In [None]:
df_sales_train_validation

In [None]:
df_sample_submission

## Transform

In [None]:
df_sales_train_evaluation_long = melt_m5(df_sales_train_evaluation)
df_sales_train_evaluation_long_cal_price = join_on_calendar_price(df_sales_train_evaluation_long, df_calendar, df_sell_prices)

# 0. Naive Forecast methods

Last day sales

In [None]:
last_day_1913_sales = df_sales_train_validation[['id', 'd_1913']].copy()
last_day_1913_sales.loc[:, 'id_join'] = last_day_1913_sales['id'].str.split('_').str[:-1].str.join('_')
last_day_1913_sales

In [None]:
# Build a naive submission that repeats each series' d_1913 value for all 28 forecast columns.
print(df_sample_submission.shape)
df_submission_train_avg = df_sample_submission.copy()
# https://stackoverflow.com/questions/64886253/how-can-i-remove-string-after-last-underscore-in-python-dataframe
# Join key: the id with its last underscore-separated token removed.
# NOTE(review): the next line adds `id_join` to df_sales_train_validation in place, and nothing else in this cell reads it — confirm it is needed.
df_sales_train_validation.loc[:, 'id_join'] = df_sales_train_validation['id'].str.split('_').str[:-1].str.join('_')
df_submission_train_avg.loc[:, 'id_join'] = df_submission_train_avg['id'].str.split('_').str[:-1].str.join('_')
# Left join keeps every submission row; `id` is dropped from the right side to avoid a duplicate column.
df_submission_train_avg = df_submission_train_avg.merge(last_day_1913_sales.drop(columns='id'),how='left', on='id_join')
# Printed again so the row count can be compared with the shape printed before the merge.
print(df_submission_train_avg.shape)
# Drop the placeholder forecast columns, then re-create each one from d_1913.
df_submission_train_avg=df_submission_train_avg.drop(columns=forecast_column_names)
for forecast_column_name in forecast_column_names: 
    df_submission_train_avg[forecast_column_name] = list(df_submission_train_avg['d_1913'].values)
# The helper columns are left out of the CSV but remain on the in-memory frame.
df_submission_train_avg.drop(columns=['id_join', 'd_1913']).to_csv("df_submission_train_avg.csv", index=False)

In [None]:
df_submission_train_avg

In [None]:
# Score the last-day naive forecast against the d_1914..d_1941 columns of the evaluation frame.
# NOTE(review): rows from label 30490 onward are used — presumably the second half of the submission frame; confirm this is the intended half.
predicted = df_submission_train_avg.loc[30490:, forecast_column_names].values
actual = df_sales_train_evaluation[validation_column_names].values
# squared=False makes sklearn return the square root of the MSE, so the printed value is an RMSE.
print("mean_squared_error: ", mean_squared_error(actual, predicted, squared=False))
print("mean absolute error: ", mae(actual, predicted))
# print("mean_absolute_percentage_error: ", MAPE(actual, predicted))
# rmsse slices `actual` along its first axis when building its naive reference.
print("Root Mean Squared Scaled Error: ", rmsse(actual, predicted))

In [None]:
# ! kaggle competitions submit -f df_submission_train_avg.csv -m df_submission_train_avg  m5-forecasting-accuracy

In [None]:
! kaggle competitions submissions m5-forecasting-accuracy

## 0.1 last 28 days

In [None]:
# Names of the final 28 training-day columns: d_1886 .. d_1913.
last_28_days = ['d_' + str(day) for day in range(1886, 1914)]
print(len(last_28_days))

In [None]:
# Naive forecast: reuse the last 28 training days as the 28 forecast columns.
df_submission_last_28_days = df_sample_submission.copy()

# The block of values is stacked twice along the rows to fill the whole submission frame.
df_submission_last_28_days[forecast_column_names] = np.tile(
    df_sales_train_validation[last_28_days].values, (2, 1)
)
df_submission_last_28_days

In [None]:
# Score the last-28-days naive forecast against the d_1914..d_1941 columns of the evaluation frame.
# NOTE(review): rows from label 30490 onward are used — presumably the second half of the submission frame; confirm this is the intended half.
predicted = df_submission_last_28_days.loc[30490:, forecast_column_names].values
actual = df_sales_train_evaluation[validation_column_names].values
# squared=False makes sklearn return the square root of the MSE, so the printed value is an RMSE.
print("mean_squared_error: ", mean_squared_error(actual, predicted, squared=False))
print("mean absolute error: ", mae(actual, predicted))
# print("mean_absolute_percentage_error: ", mean_absolute_percentage_error(actual, predicted))
# rmsse slices `actual` along its first axis when building its naive reference.
print("Root Mean Squared Scaled Error: ", rmsse(actual, predicted))

In [None]:
df_submission_last_28_days

In [None]:
df_submission_last_28_days.to_csv("submission_last_28_days.csv", index=False)
# ! kaggle competitions submit -f submission_last_28_days.csv -m df_submission_last_28_days  m5-forecasting-accuracy
! kaggle competitions submissions m5-forecasting-accuracy

# Google Auto ML

In [None]:
google_auto_ml_val_only = pd.read_csv("/kaggle/input/m5-forecaset-google-automl-results-v1/bq-results-20220208-161332-8exhswoe6c6m.csv")

In [None]:
df_sample_submission

In [None]:
# Reshape the long AutoML output (one row per id/date) to one row per id
# with one column per date, then bring `id` back as a regular column.
google_auto_ml_val_only_wide = (
    google_auto_ml_val_only
    .pivot(index='id', columns='date', values='predicted_num_sales')
    .reset_index()
)
# Rename the date columns to the submission's F1..F28 names (positional).
google_auto_ml_val_only_wide.columns = ['id'] + forecast_column_names
# Relabel the ids from the evaluation suffix to the validation suffix.
google_auto_ml_val_only_wide['id'] = google_auto_ml_val_only_wide['id'].str.replace('_evaluation','_validation')
google_auto_ml_val_only_wide

In [None]:
google_auto_ml_val_only_wide_submission = df_sample_submission.copy()
# Hopefully order of ids don't matter when submitting
google_auto_ml_val_only_wide_submission[0:len(google_auto_ml_val_only_wide)] = google_auto_ml_val_only_wide
google_auto_ml_val_only_wide_submission

In [None]:
google_auto_ml_val_only_wide_submission.to_csv("google_auto_ml_val_only_wide_submission.csv", index=False)
# ! kaggle competitions submit -f google_auto_ml_val_only_wide_submission.csv -m google_auto_ml_val_only_wide_submission  m5-forecasting-accuracy


In [None]:
! kaggle competitions submissions m5-forecasting-accuracy

In [None]:
# Score the Google AutoML predictions. This cell previously re-scored
# df_submission_last_28_days (copy-paste from the naive-forecast section), so the
# numbers printed here did not describe the AutoML model at all.
# The pivoted AutoML frame may not be in the same row order as the ground-truth frame,
# so rows are looked up by id; a missing id raises a KeyError rather than being scored silently.
# NOTE(review): assumes ids in df_sales_train_evaluation end in '_evaluation' — confirm.
validation_ids = df_sales_train_evaluation['id'].str.replace('_evaluation', '_validation', regex=False)
predicted = google_auto_ml_val_only_wide.set_index('id').loc[validation_ids, forecast_column_names].values
actual = df_sales_train_evaluation[validation_column_names].values
# squared=False makes sklearn return the square root of the MSE, so the printed value is an RMSE.
print("mean_squared_error: ", mean_squared_error(actual, predicted, squared=False))
print("mean absolute error: ", mae(actual, predicted))
# print("mean_absolute_percentage_error: ", mean_absolute_percentage_error(actual, predicted))
print("Root Mean Squared Scaled Error: ", rmsse(actual, predicted))

In [None]:
df_sales_train_evaluation

## Order matters in submission?

In [None]:
# Reorder the AutoML submission so its rows follow the id order of the sample submission.
# `Series.cat.set_categories(..., inplace=True)` was removed in pandas 2.0, so the
# categorical is built directly with the desired category order instead. Sorting a
# categorical column orders rows by category position, i.e. by sample-submission order.
sample_id_order = df_sample_submission['id'].tolist()
google_auto_ml_val_only_wide_submission_same_order = (
    google_auto_ml_val_only_wide_submission
    .assign(id=lambda frame: pd.Categorical(frame['id'], categories=sample_id_order))
    .sort_values(["id"])
    .reset_index(drop=True)
)

In [None]:
df_sample_submission

In [None]:
google_auto_ml_val_only_wide_submission

In [None]:
google_auto_ml_val_only_wide_submission_same_order.query("id in ('HOBBIES_1_001_CA_1_validation', 'HOBBIES_1_001_CA_1_evaluation')")

In [None]:
# Write the reordered submission to CSV without the index column.
google_auto_ml_val_only_wide_submission_same_order.to_csv("google_auto_ml_val_only_wide_submission_same_order.csv", index=False)
# NOTE(review): unlike the other submit commands in this notebook, which are commented out,
# this one is live and submits to the competition every time the cell is executed.
! kaggle competitions submit -f google_auto_ml_val_only_wide_submission_same_order.csv -m google_auto_ml_val_only_wide_submission_same_order  m5-forecasting-accuracy


In [None]:
! kaggle competitions submissions m5-forecasting-accuracy

# Error analysis

In [None]:
df_submission_last_28_days

## sktime naive

In [None]:
! pip install sktime