In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. READ DATA

In [None]:
# Read data file csv
df = pd.read_csv('/kaggle/input/competitive-data-science-predict-future-sales/sales_train.csv')

In [None]:
# Size of dataframe
df.shape

In [None]:
# Print head of dataframe
df.head()

In [None]:
# Filter the data for visualization: keep only the best-selling shop (keyed by [date, shop_id, item_id])

In [None]:
df['shop_id'].value_counts()

In [None]:
# Number of rows per shop_id.
# NOTE(review): despite the `lst_` prefix this holds the result of value_counts() (indexed by shop_id), not a list.
lst_top_shop = df['shop_id'].value_counts()

In [None]:
# Keep only the rows belonging to the shop_id at position 0 of `lst_top_shop`'s index.
# NOTE(review): `lst_top_shop` comes from value_counts(), so "top" presumably means the shop with the
# most rows in the file, not the most units sold — confirm this is the intended ranking.
df_top_shop = df[df['shop_id'] == lst_top_shop.index[0]]

In [None]:
df_top_shop

# 2. Preprocess data for the top shop

In [None]:
# Aggregate the top shop's rows per (date_block_num, shop_id, item_id):
#   - item_cnt_day   -> sum  (total units for the group)
#   - item_price     -> mean
#   - date           -> min and max
#   - date_block_num -> mean (also a grouping key)
# Because `date` is given a list of aggregations, the result carries two-level column labels
# such as ('item_cnt_day', 'sum').
# NOTE(review): `date` was loaded by read_csv without date parsing, so min/max presumably compare
# strings rather than calendar dates — confirm before relying on them.
sales = df_top_shop.groupby(["date_block_num","shop_id","item_id"])[['date_block_num','date', 'shop_id', 'item_id', 'item_price', 'item_cnt_day']].agg({"date_block_num":'mean',"date":["min",'max'],"item_price":"mean","item_cnt_day":"sum"})

In [None]:
sales

In [None]:
# Keep only the `item_cnt_day` aggregate and turn the group keys back into ordinary columns;
# the pivot further down reads the resulting `sum` column.
# NOTE(review): the purpose of `.apply(list)` is not evident from this cell — confirm it is needed.
# NOTE(review): this rebinds `sales`, so re-running this cell without first re-running the
# aggregation cell presumably fails (the reassigned frame is no longer the aggregation output).
sales = sales.item_cnt_day.apply(list).reset_index()

In [None]:
sales.head()

In [None]:
# Reshape to wide format: one row per (shop_id, item_id), one column per date_block_num,
# each cell holding the summed `sum` value for that item in that block.
sales_data = sales.pivot_table(index = ['shop_id', 'item_id'], columns='date_block_num', values='sum', aggfunc='sum')

In [None]:
# Replace missing item/month combinations with zero sales.
sales_data = sales_data.fillna(0)

In [None]:
sales_data

# 3. Predict & visualization

In [None]:
from matplotlib import pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

from statsmodels.tsa.holtwinters import ExponentialSmoothing as HWES

In [None]:
def MAPE(y_true, y_pred):
    """Return the mean absolute percentage error, in percent.

    Each absolute error is divided by ``max(1, |y_true|)`` rather than ``|y_true|`` so that
    actual values between -1 and 1 (including zero) do not blow up the ratio.

    Both inputs are flattened to 1-D before the comparison. Without this, a one-column
    DataFrame of shape (n, 1) compared against a Series of shape (n,) would broadcast to an
    (n, n) matrix and the mean would be taken over every pairing instead of matching pairs.

    Parameters
    ----------
    y_true : array-like
        Actual values (list, 1-D array, Series, or single-column DataFrame).
    y_pred : array-like
        Predicted values with the same number of elements as ``y_true``; a single value is
        broadcast against ``y_true``.

    Returns
    -------
    float
        Mean absolute percentage error multiplied by 100.

    Raises
    ------
    ValueError
        If ``y_pred`` holds more than one value and its element count differs from ``y_true``.
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    if y_pred.size != 1 and y_pred.shape != y_true.shape:
        raise ValueError(
            "y_true and y_pred must have the same number of elements, got {} and {}".format(
                y_true.size, y_pred.size
            )
        )
    # Floor the denominator at 1 to avoid division by zero / tiny actuals.
    denominator = np.maximum(1.0, np.abs(y_true))
    return np.mean(np.abs((y_true - y_pred) / denominator)) * 100

In [None]:
import warnings
# Suppress every warning from here on (no category or module filter is given).
# NOTE(review): this presumably also hides warnings raised while fitting the models below —
# confirm that is intended.
warnings.filterwarnings("ignore")

In [None]:
def Average(lst):
    """Return the arithmetic mean of ``lst``.

    Parameters
    ----------
    lst : sequence of numbers
        Values to average; must support ``len()``.

    Returns
    -------
    float
        ``sum(lst) / len(lst)``, or ``nan`` when ``lst`` is empty (for example when no
        model could be evaluated) instead of raising ``ZeroDivisionError``.
    """
    if len(lst) == 0:
        return float("nan")
    return sum(lst) / len(lst)

## Holt-Winters

In [None]:
# Holt-Winters forecast for a single series: row 6 of `sales_data`.
row_id = 6
n_test = 6             # trailing months held out as the test window
max_zero_months = 20   # items with this many (or more) zero-sale months are skipped

dfids = pd.DataFrame(sales_data.iloc[row_id][:])
dfids.columns = ['sale']
# NOTE(review): the index holds the pivoted date_block_num labels rather than dates, so it is
# unclear whether this assignment has any effect — confirm.
dfids.index.freq = 'MS'
dfids['sale'] = dfids['sale'] + 0.01  # shift every monthly total up by 0.01

# Zero-sale months now equal exactly 0.01; only forecast items with fewer than 20 of them.
if dfids[dfids['sale'] == 0.01].shape[0] < max_zero_months:
    print("Name of dataframe row: ", row_id)

    # Split into training and test sets; the last `n_test` periods form the test set.
    df_train = dfids.iloc[:-n_test]
    df_test = dfids.iloc[-n_test:]

    # Build and fit an additive-trend, additive-seasonal model on the training data.
    model = HWES(df_train, seasonal_periods=12, trend='add', seasonal='add')
    fitted = model.fit(optimized=True, use_brute=True)

    # Out-of-sample forecast for the `n_test` steps following the training data.
    sales_forecast = fitted.forecast(steps=n_test)

    # Compare 1-D actuals with the 1-D forecast. Passing the one-column DataFrame to MAPE
    # would broadcast (n, 1) against (n,) into an (n, n) matrix and distort the score.
    y_test = df_test['sale']
    mse_ = mean_squared_error(y_test, sales_forecast)
    mae_ = mean_absolute_error(y_test, sales_forecast)
    mape_ = MAPE(y_test, sales_forecast)
    print("evaluation metric mse = {}, mae = {}, mape = {}".format(round(mse_, 3),round(mae_, 3),round(mape_, 3)))

    # Plot the training data, the test data and the forecast on the same axes.
    fig, ax = plt.subplots()
    # NOTE(review): this band is +/- 1.96 standard errors of the forecast values themselves,
    # not a prediction interval produced by the model.
    ci = 1.96 * np.std(sales_forecast) / np.sqrt(len(sales_forecast))
    fig.suptitle('Retail Sales')
    past, = ax.plot(df_train.index, df_train, 'b.-', label='Sales History')
    future, = ax.plot(df_test.index, df_test, 'r.-', label='Actual Sales')
    predicted_future, = ax.plot(df_test.index, sales_forecast, 'g.-', label='Sales Forecast')
    ax.fill_between(df_test.index, (sales_forecast - ci), (sales_forecast + ci), color='b', alpha=.1)
    ax.legend(handles=[past, future, predicted_future])
    plt.show()
    print("_"*50)

In [None]:
# Holt-Winters back-test: walk through the rows of `sales_data` and evaluate the first
# `max_items` series that have enough non-zero months, collecting MSE / MAE / MAPE per series.
n_test = 6             # trailing months held out as the test window
max_zero_months = 20   # items with this many (or more) zero-sale months are skipped
max_items = 10         # stop after this many successfully evaluated items

count_stop = 0

lst_mse = []
lst_mae = []
lst_mape = []
for i in range(sales_data.shape[0]):
    dfids = pd.DataFrame(sales_data.iloc[i][:])
    dfids.columns = ['sale']
    # NOTE(review): the index holds the pivoted date_block_num labels rather than dates, so it
    # is unclear whether this assignment has any effect — confirm.
    dfids.index.freq = 'MS'
    dfids['sale'] = dfids['sale'] + 0.01  # shift every monthly total up by 0.01

    # Zero-sale months now equal exactly 0.01; only forecast items with fewer than 20 of them.
    if dfids[dfids['sale'] == 0.01].shape[0] < max_zero_months:
        try:
            print("Name of dataframe row: ", i)

            # Split into training and test sets; the last `n_test` periods form the test set.
            df_train = dfids.iloc[:-n_test]
            df_test = dfids.iloc[-n_test:]

            # Build and fit an additive-trend, additive-seasonal model on the training data.
            model = HWES(df_train, seasonal_periods=12, trend='add', seasonal='add')
            fitted = model.fit(optimized=True, use_brute=True)

            # Out-of-sample forecast for the `n_test` steps following the training data.
            sales_forecast = fitted.forecast(steps=n_test)

            # Compare 1-D actuals with the 1-D forecast. Passing the one-column DataFrame to
            # MAPE would broadcast (n, 1) against (n,) into an (n, n) matrix.
            y_test = df_test['sale']
            mse_ = mean_squared_error(y_test, sales_forecast)
            mae_ = mean_absolute_error(y_test, sales_forecast)
            mape_ = MAPE(y_test, sales_forecast)
            lst_mse.append(mse_)
            lst_mae.append(mae_)
            lst_mape.append(mape_)
            print("evaluation metric mse = {}, mae = {}, mape = {}".format(round(mse_, 3),round(mae_, 3),round(mape_, 3)))

            # Plot the training data, the test data and the forecast on the same axes.
            fig, ax = plt.subplots()
            # NOTE(review): this band is +/- 1.96 standard errors of the forecast values
            # themselves, not a prediction interval produced by the model.
            ci = 1.96 * np.std(sales_forecast) / np.sqrt(len(sales_forecast))
            fig.suptitle('Retail Sales')
            past, = ax.plot(df_train.index, df_train, 'b.-', label='Sales History')
            future, = ax.plot(df_test.index, df_test, 'r.-', label='Actual Sales')
            predicted_future, = ax.plot(df_test.index, sales_forecast, 'g.-', label='Sales Forecast')
            ax.legend(handles=[past, future, predicted_future])
            ax.fill_between(df_test.index, (sales_forecast - ci), (sales_forecast + ci), color='b', alpha=.1)
            plt.show()
            plt.close(fig)  # release the figure so repeated iterations do not accumulate them
            print("_"*50)
            count_stop = count_stop + 1
        except Exception as exc:
            # Best effort: skip series the model cannot handle, but say so instead of hiding
            # the failure (and let KeyboardInterrupt / SystemExit propagate).
            print("Skipping dataframe row {}: {!r}".format(i, exc))
            plt.close('all')
    if count_stop == max_items:
        break

In [None]:
lst_mse

In [None]:
lst_mae

In [None]:
lst_mape

In [None]:
Average(lst_mse)

In [None]:
Average(lst_mae)

In [None]:
Average(lst_mape)

## ARIMA

In [None]:
from statsmodels.tsa.arima.model import ARIMA

from math import sqrt

In [None]:
# Walk-forward ARIMA(1,1,0) evaluation for a single series: row 6 of `sales_data`.
row_id = 6
n_test = 6  # trailing months held out as the test window

dfids = pd.DataFrame(sales_data.iloc[row_id][:])
dfids.columns = ['sale']
# NOTE(review): the index holds the pivoted date_block_num labels rather than dates, so it is
# unclear whether this assignment has any effect — confirm.
dfids.index.freq = 'MS'
dfids['sale'] = dfids['sale'] + 0.01  # shift every monthly total up by 0.01


print("Name of dataframe row: ", row_id)
X = dfids['sale'].values
train, test = X[:-n_test], X[-n_test:]
# Index labels for the two windows, taken from this cell's own frame so the plot does not
# depend on `df_test` left behind by the Holt-Winters cells.
train_index, test_index = dfids.index[:-n_test], dfids.index[-n_test:]
history = list(train)
predictions = []

# Walk-forward validation: refit on everything seen so far, predict one step ahead, then
# append the true observation before moving on to the next month.
for obs in test:
    model = ARIMA(history, order=(1,1,0))
    model_fit = model.fit()
    predictions.append(model_fit.forecast()[0])
    history.append(obs)

# Evaluate the forecasts against the held-out months.
mse_  = mean_squared_error(test, predictions)
mae_  = mean_absolute_error(test, predictions)
mape_ = MAPE(test, predictions)
print("evaluation metric mse = {}, mae = {}, mape = {}".format(round(mse_, 3),round(mae_, 3),round(mape_, 3)))


fig, ax = plt.subplots()
predicted_values = np.asarray(predictions)
# NOTE(review): this band is +/- 1.96 standard errors of the predicted values themselves,
# not a prediction interval produced by the model.
ci = 1.96 * np.std(predicted_values) / np.sqrt(len(predicted_values))
fig.suptitle('Retail Sales')
past, = ax.plot(train_index, train, 'b.-', label='Sales History')
future, = ax.plot(test_index, test, 'r.-', label='Actual Sales')
predicted_future, = ax.plot(test_index, predicted_values, 'g.-', label='Sales Forecast')
ax.legend(handles=[past, future, predicted_future])
ax.fill_between(test_index, (predicted_values - ci), (predicted_values + ci), color='b', alpha=.1)
plt.show()

In [None]:
# ARIMA back-test: walk through the rows of `sales_data` and evaluate the first `max_items`
# series that have enough non-zero months, collecting MSE / MAE / MAPE per series.
n_test = 6             # trailing months held out as the test window
max_zero_months = 20   # items with this many (or more) zero-sale months are skipped
max_items = 10         # stop after this many successfully evaluated items

count_stop = 0

lst_arima_mse = []
lst_arima_mae = []
lst_arima_mape = []
for i in range(sales_data.shape[0]):
    dfids = pd.DataFrame(sales_data.iloc[i][:])
    dfids.columns = ['sale']
    # NOTE(review): the index holds the pivoted date_block_num labels rather than dates, so it
    # is unclear whether this assignment has any effect — confirm.
    dfids.index.freq = 'MS'
    dfids['sale'] = dfids['sale'] + 0.01  # shift every monthly total up by 0.01

    # Zero-sale months now equal exactly 0.01; only forecast items with fewer than 20 of them.
    if dfids[dfids['sale'] == 0.01].shape[0] < max_zero_months:
        try:
            print("ARIMA: Name of dataframe row: ", i)
            X = dfids['sale'].values
            train, test = X[:-n_test], X[-n_test:]
            # Index labels come from this iteration's own frame, not from `df_test` left
            # behind by the Holt-Winters cells.
            train_index, test_index = dfids.index[:-n_test], dfids.index[-n_test:]
            history = list(train)
            predictions = []

            # Walk-forward validation: refit on everything seen so far, predict one step
            # ahead, then append the true observation before moving on.
            for obs in test:
                model = ARIMA(history, order=(1,1,0))
                model_fit = model.fit()
                predictions.append(model_fit.forecast()[0])
                history.append(obs)

            # Evaluate the forecasts against the held-out months.
            mse_  = mean_squared_error(test, predictions)
            mae_  = mean_absolute_error(test, predictions)
            mape_ = MAPE(test, predictions)
            lst_arima_mse.append(mse_)
            lst_arima_mae.append(mae_)
            lst_arima_mape.append(mape_)
            print("evaluation metric mse = {}, mae = {}, mape = {}".format(round(mse_, 3),round(mae_, 3),round(mape_, 3)))

            fig, ax = plt.subplots()
            predicted_values = np.asarray(predictions)
            # NOTE(review): this band is +/- 1.96 standard errors of the predicted values
            # themselves, not a prediction interval produced by the model.
            ci = 1.96 * np.std(predicted_values) / np.sqrt(len(predicted_values))
            fig.suptitle('Retail Sales')
            past, = ax.plot(train_index, train, 'b.-', label='Sales History')
            future, = ax.plot(test_index, test, 'r.-', label='Actual Sales')
            predicted_future, = ax.plot(test_index, predicted_values, 'g.-', label='Sales Forecast')
            ax.legend(handles=[past, future, predicted_future])
            ax.fill_between(test_index, (predicted_values - ci), (predicted_values + ci), color='b', alpha=.1)
            plt.show()
            plt.close(fig)  # release the figure so repeated iterations do not accumulate them
            print("_"*50)

            count_stop = count_stop + 1
        except Exception as exc:
            # Best effort: skip series the model cannot handle, but say so instead of hiding
            # the failure (and let KeyboardInterrupt / SystemExit propagate).
            print("ARIMA: skipping dataframe row {}: {!r}".format(i, exc))
            plt.close('all')
    if count_stop == max_items:
        break

In [None]:
lst_arima_mse

In [None]:
lst_arima_mae

In [None]:
lst_arima_mape

In [None]:
Average(lst_arima_mse)

In [None]:
Average(lst_arima_mae)

In [None]:
Average(lst_arima_mape)

In [None]:
len(lst_arima_mape)