In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import seaborn as sns
sns.set_style('whitegrid')
sns.set_context('poster')

import matplotlib.pyplot as plt
from pandas_profiling import ProfileReport
from IPython.display import HTML

from tqdm import tqdm

%matplotlib inline
# Notebook-wide matplotlib defaults: figure size (in inches) and title font size.
plt.rcParams.update({
    "figure.figsize": (16, 12),
    "axes.titlesize": 16,
})

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

In [None]:
# Load the dataset.
# All four CSV files live in the same directory; keep it in one place so the
# notebook can be pointed at another location (e.g. a local copy of the data).
M5_INPUT_DIR = '/kaggle/input/m5-forecasting-accuracy'

train_sales = pd.read_csv(os.path.join(M5_INPUT_DIR, 'sales_train_validation.csv'))
sell_prices = pd.read_csv(os.path.join(M5_INPUT_DIR, 'sell_prices.csv'))
calendar = pd.read_csv(os.path.join(M5_INPUT_DIR, 'calendar.csv'))
submission = pd.read_csv(os.path.join(M5_INPUT_DIR, 'sample_submission.csv'))

In [None]:
# Quick look at the shape of each loaded table, printed in load order.
for table in (train_sales, sell_prices, calendar, submission):
    print(table.shape)

In [None]:
# Two example rows of the training set.
# A fixed random_state keeps the displayed rows identical across "Restart & Run All".
train_sales.sample(n=2, random_state=42)

In [None]:
# Two example rows of the prices set.
# A fixed random_state keeps the displayed rows identical across "Restart & Run All".
sell_prices.sample(n=2, random_state=42)

In [None]:
# Two example rows of the calendar set.
# A fixed random_state keeps the displayed rows identical across "Restart & Run All".
calendar.sample(n=2, random_state=42)

## This code should be run on a local machine due to kernel memory constraints

In [None]:

# Collect the day columns ('d_1', 'd_2', ...) directly from the frame, in file order.
# Deriving them (instead of hard-coding 1913) keeps the cell working for a sales
# file that covers a different number of days.
d_cols = [col for col in train_sales.columns if col.startswith('d_')]

# Melt columns into rows so that each row is a separate and discrete entry with one
# target: one row per (series, day) with the day label in 'd' and the value in 'sales'.
# Only the listed id_vars are carried over; other non-day columns are dropped.
tidy_df = pd.melt(frame=train_sales,
                  id_vars=['id', 'item_id', 'cat_id', 'store_id'],
                  value_vars=d_cols,
                  var_name='d',
                  value_name='sales')

# After melting, every original id repeats once per day.
# Append the day label to the id so that each row is unique again.
tidy_df['id'] = tidy_df['id'] + '_' + tidy_df['d']

# Check this turned out ok so far.
tidy_df.head()

In [None]:
# Merge the prices.
# NOTE - For now we are aggregating on the mean price of each item.
# TODO: We will want to set the price with the week or run some statistics on price volatility over time.

# numeric_only=True restricts the mean to numeric columns. Without it, pandas >= 2.0
# raises TypeError as soon as the frame contains a non-numeric, non-key column;
# older versions silently skipped such columns, so the result is unchanged there.
price_means = sell_prices.groupby(['item_id']).mean(numeric_only=True)

# Attach the calendar columns to every tidy row via the day label 'd'.
# NOTE(review): the variable names are swapped relative to their content - this frame
# holds the *calendar* information; the prices are merged in the following cell.
# The names are kept because later cells refer to them.
with_prices_df = pd.merge(left=tidy_df, right=calendar, on='d')

# Let's see the results.
with_prices_df.head(10)

In [None]:
# Join the per-item price aggregates onto the calendar-enriched sales rows.
with_date_info_df = with_prices_df.merge(price_means, on='item_id')

# Drop d and drop item_id (price is an informative proxy); 'wday' is dropped as well.
# The drop happens in place, so `with_date_info_df` itself loses these columns.
redundant_cols = ['d', 'wday', 'item_id']
with_date_info_df.drop(columns=redundant_cols, inplace=True)

# fillna(0) returns a new frame in which every NaN (in any column, not only the
# categorical ones) is replaced by 0; `with_date_info_df` keeps its NaNs.
total_tidy_df = with_date_info_df.fillna(0)

# First row of the merged frame, shown before the NaN fill.
print(with_date_info_df.iloc[0])



In [None]:
# Categorical encoded column helper function.
def categorically_encode_col(df, col):
    """One-hot encode a single column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the column to encode. It is not modified.
    col : hashable
        Label of the column to encode. Its string form is used as the prefix
        of the generated column names (``<col>_<value>``).

    Returns
    -------
    pandas.DataFrame
        One indicator column per distinct value of ``df[col]``; no level is
        dropped (``drop_first=False``).
    """
    source_values = df[col]
    return pd.get_dummies(source_values, prefix=str(col), drop_first=False)

# Categorically encode the categorical columns to make them usable as ML features.
# The un-encoded source columns are kept next to the new indicator columns; drop them
# afterwards with `total_tidy_df.drop(columns=cols_to_encode)` if they are not needed.

# CREATE_TIDY_DF is not assigned in the cells above, so on a fresh kernel this cell
# stopped with a NameError. Honour a value that was set elsewhere, otherwise default
# to running the encoding.
CREATE_TIDY_DF = globals().get('CREATE_TIDY_DF', True)

if CREATE_TIDY_DF:

    # Categorically encode categorical columns
    cols_to_encode = ['cat_id', 'store_id', 'weekday', 'event_type_1', 'event_type_2']

    # Encode every column first, then concatenate once: concatenating inside the loop
    # copied the whole (large) frame once per encoded column.
    encoded_frames = [categorically_encode_col(total_tidy_df, col) for col in cols_to_encode]
    total_tidy_df = pd.concat([total_tidy_df] + encoded_frames, axis=1)
        # total_tidy_df.drop(col, inplace = True)  # Drop the un-encoded column

In [None]:
# Show the column labels of the tidy frame as it stands after the previous cell.
total_tidy_df.columns

## Yearly cycle decomposition of the CA_1 store

In [None]:
# Seasonal decomposition of the CA_1 series using a period of 365 observations.
from statsmodels.tsa.seasonal import seasonal_decompose
days_per_year = 365  # assumes one observation per day - TODO confirm

# NOTE(review): `store_sum` is not defined in any cell visible above, so this cell
# raises NameError on a fresh kernel unless it is built elsewhere. Presumably it holds
# per-store sales totals with one column per store - confirm and add the defining cell.
# `time_series` is also the input of the later `sarima_train_test(time_series)` call.
time_series = store_sum["CA_1"]
sj_sc = seasonal_decompose(time_series, period = days_per_year)
sj_sc.plot()

plt.show()

## Weekly cycle decomposition of the CA_1 store

In [None]:
# Seasonal decomposition of the CA_1 series using a period of 7 observations.
# NOTE(review): this import repeats the one in the yearly-decomposition cell.
from statsmodels.tsa.seasonal import seasonal_decompose
days_per_week = 7  # assumes one observation per day - TODO confirm

# NOTE(review): `store_sum` is not defined in any cell visible above; this cell
# raises NameError on a fresh kernel unless it is built elsewhere - confirm.
# `time_series` and `sj_sc` overwrite the values assigned by the yearly cell.
time_series = store_sum["CA_1"]
sj_sc = seasonal_decompose(time_series, period = days_per_week)
sj_sc.plot()

plt.show()

## Baseline SARIMA model

In [None]:
from statsmodels.tsa.statespace.sarimax import SARIMAX

def sarima_train_test(t_series, p = 2, d = 1, r = 2, NUM_TO_FORECAST = 56, do_plot_results = True):
    """Fit a baseline ARIMA(p, d, r) model and score it on a held-out tail.

    The last ``NUM_TO_FORECAST`` observations of ``t_series`` are withheld, the
    model is fitted on the remaining observations only, and an out-of-sample
    forecast for the withheld window is compared with the actual values.
    (Previously the model was fitted on the full series, so the "forecast" was an
    in-sample prediction and the reported error covered all in-sample residuals.)

    Parameters
    ----------
    t_series : pandas.Series
        Series to model, ordered in time.
    p, d, r : int
        Non-seasonal order handed to SARIMAX as ``order=(p, d, r)``; ``r`` is the
        moving-average order (conventionally called ``q``).
    NUM_TO_FORECAST : int
        Number of trailing observations to hold out and forecast. Must be
        positive and smaller than the length of ``t_series``.
    do_plot_results : bool
        When True, show the residual diagnostics and the forecast-vs-actual plot.
        When False, only the printed metrics are produced.

    Raises
    ------
    ValueError
        If ``NUM_TO_FORECAST`` is not in ``1 .. len(t_series) - 1``.

    Returns
    -------
    None
        The mean absolute error on the held-out window and the model summary
        are printed.
    """
    n_obs = t_series.shape[0]
    # A horizon of 0 would turn the `-NUM_TO_FORECAST` slices into "everything",
    # and a horizon >= n_obs would leave nothing to train on.
    if not 0 < NUM_TO_FORECAST < n_obs:
        raise ValueError(
            'NUM_TO_FORECAST must be between 1 and %d, got %r' % (n_obs - 1, NUM_TO_FORECAST)
        )

    # Train / test split along time: the tail is never seen by the fit.
    train = t_series.iloc[:-NUM_TO_FORECAST]
    actual = np.asarray(t_series.iloc[-NUM_TO_FORECAST:])
    dates = np.arange(n_obs)[-NUM_TO_FORECAST:]

    model = SARIMAX(train, order = (p, d, r), trend = 'c')
    results = model.fit()

    # Out-of-sample forecast for exactly the held-out window.
    forecast = results.get_forecast(steps = NUM_TO_FORECAST)
    mean_forecast = np.asarray(forecast.predicted_mean)

    if do_plot_results:
        results.plot_diagnostics(figsize=(18, 14))
        plt.show()

        # Plot the forecast against the held-out actual values.
        plt.figure(figsize=(14,16))
        plt.plot(dates, mean_forecast, color = 'red', label = 'forecast')
        plt.plot(dates, actual, color = 'blue', label = 'actual')
        plt.legend()
        plt.title('Predicted vs. Actual Values')
        plt.show()

    # Error on the held-out window only.
    mae_sarima = np.mean(np.abs(actual - mean_forecast))
    print('Mean absolute error: ', mae_sarima)
    print(results.summary())


In [None]:
# Run the baseline on `time_series` (assigned in the decomposition cells above as the
# CA_1 column of `store_sum`) with the function's default order (2, 1, 2) and a
# 56-observation window.
sarima_train_test(time_series)

In [None]:

# Load pre-computed SARIMA forecasts and reshape them into the submission layout.
sarima_preds = pd.read_csv('/kaggle/input/sarima_submission.csv')
sarima_preds[sarima_preds < 0] = 0  # Convert all negative numbers into 0.
sarima_preds['id'] = submission['id']

# Cleaning: keep the first 29 columns and remove 'Unnamed: 0'
# (presumably the index column written when the CSV was saved - confirm).
submission_df = sarima_preds.iloc[:, :29].drop(columns=['Unnamed: 0'])

# Put the ids of the sample submission in front and adopt its column names.
# The original code referenced `submission_file`, which is not defined anywhere in
# this notebook; `submission` (loaded from sample_submission.csv) is the frame that
# provides both the ids and the expected header.
submission_df.index = submission['id']
submission_df = submission_df.reset_index()
submission_df.columns = submission.columns

sarima_df = submission_df.copy()
sarima_df.to_csv('SARIMA.csv', index=False)

# Last expression of the cell, so the preview is actually displayed.
submission_df.head()