# Intro
Welcome to the [M5 Forecasting - Accuracy](https://www.kaggle.com/c/m5-forecasting-accuracy) competition.
![](https://storage.googleapis.com/kaggle-competitions/kaggle/18599/logos/header.png)

# Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import time

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBRegressor

In [None]:
import multiprocessing as mp

In [None]:
import warnings
# Silence all Python warnings for the rest of the notebook.
warnings.filterwarnings("ignore")

# Path

In [None]:
# Folder that holds the competition input files.
path_in = '/kaggle/input/m5-forecasting-accuracy/'
# List the folder contents to see which files are available.
os.listdir(path_in)

# Load Data

In [None]:
# Read the four competition CSV files from the input folder into DataFrames.
# path_in already ends with a separator, so join() yields the same paths as
# plain concatenation.
sales = pd.read_csv(os.path.join(path_in, 'sales_train_validation.csv'))
cal = pd.read_csv(os.path.join(path_in, 'calendar.csv'))
prices = pd.read_csv(os.path.join(path_in, 'sell_prices.csv'))
samp_subm = pd.read_csv(os.path.join(path_in, 'sample_submission.csv'))

# EDA

In [None]:
# Print the (rows, cols) shape of every loaded table.
for _label, _frame in (('sales', sales), ('cal', cal),
                       ('prices', prices), ('subm', samp_subm)):
    print(_label + ' shape (rows, cols): ', _frame.shape)

## Sales Data

In [None]:
# Preview the first rows of the sales table.
sales.head()

In [None]:
# Integer-encode the categorical id columns of the sales table.
# This cell only touches `sales`; the calendar table is cleaned in its own
# cell right before its columns are encoded.
features_cat = ['cat_id', 'state_id']
le = LabelEncoder()
for col in features_cat:
    # fit_transform learns the distinct labels of the column and maps them to
    # integer codes in a single step.
    sales[col] = le.fit_transform(sales[col])

## Calendar Data

In [None]:
# Preview the first rows of the calendar table.
cal.head()

In [None]:
# Integer-encode the event columns of the calendar table.
features_cat = ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
# Missing values are replaced by the literal string 'empty' first, so that
# every cell holds a label the encoder can handle.
cal.fillna('empty', inplace=True)
le = LabelEncoder()
for col in features_cat:
    # Learn the labels of this column and replace them by their integer codes.
    cal[col] = le.fit_transform(cal[col])

Count the calendar rows for each value of the feature `wday`:

In [None]:
# Group the calendar rows by `wday` and count the non-missing entries of every
# other column within each group.
group_wday = cal.groupby('wday').count()
group_wday

## Price Data

In [None]:
# Preview the first rows of the price table.
prices.head()

In [None]:
# Preview the first rows of the sample submission.
samp_subm.head()

# Functions

In [None]:
# Values of the first sales row, taken from every column at position 6 and
# beyond (the columns the functions below treat as the daily sales).
sales[sales.columns[6:]].iloc[0].values

In [None]:
def plot_timeseries(data, index):
    """Plot the daily sales of one row of ``data`` as a line chart.

    Args:
        data: DataFrame whose columns from position 6 onwards hold the
            per-day sales values (the layout of ``sales``).
        index: Positional row index of the series to plot; it is also used
            as the legend label.
    """
    y = data[data.columns[6:]].iloc[index].values
    # Build the day axis from the data itself rather than a fixed 1913 days,
    # so frames with a different number of day columns can be plotted too.
    x = range(1, len(y) + 1)

    fig = plt.figure(figsize=(16,9))
    ax = fig.add_subplot(111)
    ax.plot(x, y, linewidth=2.8, label=index)
    plt.legend(loc='upper center')
    plt.xlabel('days')
    plt.ylabel('number of sales')
    plt.grid()

In [None]:
def plot_sales_and_preds(sales, preds, index, skip_days=0):
    """Plot the sales history of one series together with its forecasts.

    Args:
        sales: DataFrame whose columns from position 6 onwards hold the
            daily sales values.
        preds: Submission-style DataFrame; the columns after the first one
            hold the forecasts. The evaluation forecast of a series is read
            from the row 30490 positions after its validation forecast.
        index: Positional row index of the series.
        skip_days: Number of leading days of the history to leave out.
    """
    figure, axis = plt.subplots(figsize=(16, 9))
    forecast_cols = preds.columns[1:]

    # One entry per curve: (day numbers, values, colour, legend label).
    curves = (
        (range(1+skip_days, 1913+1),
         sales[sales.columns[6+skip_days:]].iloc[index].values,
         'blue', 'sale'),
        (range(1913+1, 1941+1),
         preds[forecast_cols].iloc[index].values,
         'orange', 'val'),
        (range(1941+1, 1969+1),
         preds[forecast_cols].iloc[index+30490].values,
         'green', 'eval'),
    )
    for days, values, colour, name in curves:
        axis.plot(days, values, linewidth=2.8, color=colour, label=name)

    axis.legend(loc='upper center')
    axis.set_xlabel('days')
    axis.set_ylabel('number of sales')
    axis.grid()

In [None]:
# Plot the sales history of the series at row position 200.
plot_timeseries(sales, 200)

# Simple Prediction
Use the columns: wday, month, year, sell_price and the SNAP flags (snap_CA, snap_TX, snap_WI).

In [None]:
def predict_article(articles_list):
    """Fit the shared model per article and forecast the 28 days after the
    known sales.

    The function relies on module-level objects that must exist before it is
    called: ``X_train_org`` (one calendar row per day, ordered ``d_1`` first),
    ``sales``, ``prices``, ``scaler`` and ``model``. It overwrites the
    ``store_id``, ``cat_id`` and ``state_id`` columns of ``X_train_org`` for
    every article.

    Args:
        articles_list: Iterable of row positions in ``sales``.

    Returns:
        A flat list holding two arrays per article, in order: the 28
        validation forecasts, then 28 zeros as a placeholder for the
        evaluation period (which is not modelled here).
    """
    skip_days = 0
    horizon = 28
    merge_on = ['wm_yr_wk', 'store_id']
    features = ['wday', 'month', 'year', 'sell_price', 'snap_CA', 'snap_TX', 'snap_WI']

    # Select the daily sales columns once instead of copying them from the
    # full frame again for every article.
    day_sales = sales[sales.columns[6+skip_days:]]
    # Number of days with known sales (the value the slices used to
    # hard-code as 1913).
    n_days = len(sales.columns) - 6

    results = []
    for article in articles_list:
        X_train_org['store_id'] = sales.loc[article, 'store_id']
        X_train_org['cat_id'] = sales.loc[article, 'cat_id']
        X_train_org['state_id'] = sales.loc[article, 'state_id']
        item = sales.loc[article, 'item_id']

        # Attach the weekly sell price of this item in this store to each day.
        X_train = pd.merge(X_train_org, prices[prices['item_id']==item], on=merge_on, how='left')

        X_train = X_train[features]
        # Days without a price get the column mean; assign the result rather
        # than filling a column subset in place.
        X_train = X_train.fillna(X_train.mean())

        # Train on log1p(sales); forecasts are mapped back with expm1 below.
        y_train = np.log1p(day_sales.iloc[article].values)

        # scale data
        scaler.fit(X_train)
        X_train_scale = scaler.transform(X_train)

        # Row i of X_train_scale describes day d_(i+1). The rows matching
        # y_train are therefore skip_days .. n_days-1, and the validation
        # window is the next `horizon` rows. (Starting the slices one row
        # later would pair each target with the features of the next day.)
        model.fit(X_train_scale[skip_days:n_days], y_train)
        preds_val = np.expm1(model.predict(X_train_scale[n_days:n_days+horizon]))

        # Placeholder for the evaluation period.
        preds_eval = np.zeros(horizon, dtype=int)

        results.append(preds_val)
        results.append(preds_eval)

    return results
    

Test

In [None]:
# Smoke test: run the per-article model on the first article only.
article_list = list(range(0, 1))

# One row per day label, d_1 ... d_1969.
sales_article = pd.DataFrame({'d': ['d_'+str(day) for day in range(1, 1969+1)]})

# Attach the calendar features to every day.
X_train_org = pd.merge(sales_article, cal, on='d')

# Placeholder column; predict_article overwrites it with the store of the
# current article before merging the prices.
X_train_org['store_id'] = None

# Scaler that maps every feature to the range [0, 1].
scaler = MinMaxScaler(feature_range=(0, 1))

# Gradient-boosted regressor shared by all articles.
model = XGBRegressor(
    objective='reg:squarederror',
    learning_rate=0.1,
    max_depth=3,
    n_estimators=200,
    random_state=2020,
)

results = predict_article(article_list)

In [None]:
# Display the forecasts returned by the test run.
results

In [None]:
def predict_by_mean(articles_list, data=None, num_vals=5, shift=7, horizon=28):
    """Forecast each article by averaging its values from previous weeks.

    The forecast for a day is the mean of the ``num_vals`` values that lie
    ``shift``, ``2*shift``, ... days before it. Forecasts are appended to the
    history, so later days may average over earlier forecasts.

    Args:
        articles_list: Iterable of positional row indices into ``data``.
        data: DataFrame whose columns from position 6 onwards hold the daily
            sales. Defaults to the module-level ``sales`` frame.
        num_vals: Number of past values averaged per forecast.
        shift: Distance in days between the averaged values.
        horizon: Number of days forecast per period.

    Returns:
        A flat list holding two lists per article, in order: the ``horizon``
        validation forecasts, then ``horizon`` zeros as a placeholder for the
        evaluation period (which is not forecast).

    Raises:
        ValueError: If the history is shorter than ``num_vals * shift`` days,
            in which case the look-back would wrap around to the end of the
            series.
    """
    if data is None:
        data = sales

    # Select the daily columns once; doing it inside the loop would copy the
    # whole frame again for every article.
    day_values = data[data.columns[6:]]
    n_hist = day_values.shape[1]
    if n_hist < num_vals * shift:
        raise ValueError(
            'history of %d days is shorter than the look-back of %d days'
            % (n_hist, num_vals * shift)
        )

    results = []
    for article in articles_list:
        y_train = day_values.iloc[article].values.tolist()

        # predict first period - val
        preds_val = []
        for day in range(horizon):
            lagged = [y_train[n_hist + day - weeks_back * shift]
                      for weeks_back in range(1, num_vals + 1)]
            value = sum(lagged) / num_vals
            preds_val.append(value)
            # Make the forecast available to the look-back of later days.
            y_train.append(value)

        # second period - eval: intentionally all zeros
        preds_eval = [0] * horizon

        results.append(preds_val)
        results.append(preds_eval)

    return results

In [None]:
# Run the averaging baseline on a single article (row position 7623).
article_list = [7623]#[i for i in range(0, 1)]
results = predict_by_mean(article_list)

Prod

In [None]:
# sales_article = pd.DataFrame()
# sales_article['d'] = ['d_'+str(i) for i in range(1, 1969+1)]

# # merge with cal data
# X_train_org = pd.merge(sales_article, cal, on='d')

# # create new features for merge
# X_train_org['store_id'] = None

# # define scaler
# scaler = MinMaxScaler(feature_range=(0, 1))

# # define model
# #objective: reg:squarederror
# model = XGBRegressor(objective='reg:squarederror', learning_rate=0.1,
#                          max_depth=3, n_estimators=100, random_state=2020)

# cores = 4
# article_lists = np.array_split(range(len(sales.index)), cores, axis=0)
# start = time.time()
# pool = mp.Pool(cores)
# results = np.vstack(pool.map(predict_article, article_lists))
# pool.close()
# pool.join()
# end = time.time()
# print(end-start)


In [None]:
# Split all article row positions into one chunk per worker process and run
# the averaging baseline on the chunks in parallel.
cores = 4
article_lists = np.array_split(range(len(sales.index)), cores, axis=0)
start = time.time()
pool = mp.Pool(cores)
try:
    # pool.map returns the chunk results in chunk order, so stacking them
    # keeps the rows in the original article order.
    results = np.vstack(pool.map(predict_by_mean, article_lists))
finally:
    # Shut the workers down even if a chunk fails, so no processes are left
    # behind.
    pool.close()
    pool.join()
end = time.time()
# Elapsed wall-clock time in seconds.
print(end-start)

In [None]:
# Extrapolate the measured runtime to hours for all 30490 series (the series
# count used as row offset elsewhere in this notebook).
# NOTE(review): the division by 10 assumes `end-start` was measured on a
# 10-article test run - confirm before relying on this number.
(end-start)*30490/10/3600

For testing of 10 articles:

In [None]:
# samp_subm.loc[0:10-1, samp_subm.columns[1:]] = results[0:len(results):2]
# samp_subm.loc[30490:30490+10-1, samp_subm.columns[1:]] = results[1:len(results):2]

In [None]:
# Preview the first rows of the submission table.
samp_subm.head()

For all articles:

In [None]:
# `results` alternates val / eval rows per article: the even rows go into
# submission rows 0..30489, the odd rows into rows 30490..60979.
forecast_cols = samp_subm.columns[1:]
samp_subm.loc[0:30490-1, forecast_cols] = results[::2]
samp_subm.loc[30490:30490+30490-1, forecast_cols] = results[1::2]

# Export Submission

In [None]:
# Write the submission to disk without the DataFrame index column.
samp_subm.to_csv('submission.csv', index=False)

# Plot Results

In [None]:
# Plot history and forecasts for the series at row position 5400, leaving out
# the first 1400 days of the history.
plot_sales_and_preds(sales, samp_subm, 5400, skip_days=1400)