In [None]:
import os
import time

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns; sns.set(style="ticks", color_codes=True)

from sklearn.metrics import mean_absolute_error as MAE, mean_squared_error as MSE
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_selection import RFE

import matplotlib.pyplot as plt
from IPython.display import display

# **Data Loading**

In [None]:
# Column names applied to each raw sales CSV (the files' own header row is replaced).
SALES_COLUMNS = {
    'train': ['Store', 'Dept', 'Date', 'weeklySales', 'isHoliday'],
    'test': ['Store', 'Dept', 'Date', 'isHoliday'],
}

# The per-week features and per-store attributes are the same for both splits,
# so they are read once instead of once per loop iteration.
features = pd.read_csv("../input/features.csv.zip", sep=',', header=0,
                       names=['Store', 'Date', 'Temperature', 'Fuel_Price',
                              'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5',
                              'CPI', 'Unemployment', 'IsHoliday']).drop(columns=['IsHoliday'])
stores = pd.read_csv("../input/stores.csv", names=['Store', 'Type', 'Size'], sep=',', header=0)

# Datasets maps the split name ('train' / 'test') to its merged DataFrame.
Datasets = dict()
for ds in ['train', 'test']:
    dataset = pd.read_csv(f"../input/{ds}.csv.zip", sep=',', header=0, names=SALES_COLUMNS[ds])

    # Join keys are spelled out: a bare merge() joins on every shared column name,
    # which silently changes if a column is later added to either side.
    dataset = (dataset
               .merge(stores, how='left', on='Store')
               .merge(features, how='left', on=['Store', 'Date']))

    # Dates are merged as strings (both sides read them as text) and parsed afterwards.
    dataset['Date'] = pd.to_datetime(dataset['Date'])
    # dataset["isTomorrowHoliday"] = dataset["isHoliday"].shift(-1).fillna(False)
    display(dataset.head())

    Datasets[ds] = dataset

In [None]:
# Inspect the column dtypes of the merged training frame.
Datasets['train'].dtypes

In [None]:
def describe_missing_values(df: pd.DataFrame) -> pd.DataFrame:
    """Summarize the columns of ``df`` that contain missing values.

    Prints the total number of columns and how many of them have missing
    entries, then returns a table indexed by column name with two columns:
    'Missing Values' (count of nulls) and '% of Total Values' (share of rows
    that are null, rounded to one decimal). Only columns whose missing share
    is non-zero are kept, sorted by that share in descending order.
    """
    null_counts = df.isnull().sum()
    summary = pd.DataFrame({
        'Missing Values': null_counts,
        '% of Total Values': 100 * null_counts / len(df),
    })

    # Filter on the unrounded percentage so tiny non-zero shares are still reported.
    has_missing = summary['% of Total Values'] != 0
    summary = (summary[has_missing]
               .sort_values('% of Total Values', ascending=False)
               .round(1))

    print(f"Dataframe has {df.shape[1]} columns,")
    print(f"\t\t {summary.shape[0]} columns that have missing values.")

    return summary


def visualize_distribution_of_missing_values(df: pd.DataFrame):
    """Draw a bar chart of the percentage of missing values in each column of ``df``.

    Columns are ordered by ascending missing count; each bar height is the
    share of rows that are missing, truncated to a whole percent. An empty
    frame is drawn with all bars at 0 instead of dividing by zero.
    """
    n_rows = len(df)
    nan_counts = df.isna().sum().sort_values()

    # Whole-percent share of missing rows per column (int() truncates, it does not round).
    nan_percent = {
        column: int(count / n_rows * 100) if n_rows else 0
        for column, count in nan_counts.items()
    }

    # Visualize
    plt.figure(figsize=(9, 6))
    plt.suptitle('Distribution of Empty Values', fontsize=19)
    plt.bar(list(nan_percent.keys()), list(nan_percent.values()))
    plt.ylabel('% of rows missing')
    plt.xticks(rotation=69)
    plt.show()
    

# Report the missing-value summary of each split; display() renders the returned
# DataFrame as a rich table instead of the plain-text dump print() would give.
for ds in ['train', 'test']:
    print(f'\n\n{ds}-set:')
    display(describe_missing_values(Datasets[ds]))
    # visualize_distribution_of_missing_values(Datasets[ds])

# **Data Exploration**

In [None]:
def scatter(dataset, column):
    """Scatter-plot ``dataset[column]`` (x-axis) against ``dataset['weeklySales']`` (y-axis).

    A new figure is opened for every call, so successive calls do not draw
    over each other.
    """
    target = 'weeklySales'
    plt.figure()
    # Low alpha so dense regions remain distinguishable from isolated points.
    plt.scatter(dataset[column], dataset[target], alpha=0.169)
    plt.xlabel(column)
    plt.ylabel(target)

In [None]:
# Plot weekly sales against each candidate explanatory column, one figure per column.
scatter_columns = (
    'Fuel_Price', 'Size', 'CPI', 'Type', 'isHoliday',
    'Unemployment', 'Temperature', 'Store', 'Dept',
)
for col in scatter_columns:
    scatter(Datasets['train'], col)

In [None]:
# Correlation heat-map of the training columns.
fig = plt.figure(figsize=(18, 14))
# At this point the frame still contains the string column 'Type' and the datetime
# column 'Date'. Recent pandas raises in DataFrame.corr() on non-numeric columns
# instead of silently dropping them, so the numeric/boolean columns are selected
# explicitly (this also works on older pandas, unlike the numeric_only keyword).
corr = Datasets['train'].select_dtypes(include=['number', 'bool']).corr()
c = plt.pcolor(corr)
# Ticks are offset by 0.5 so that labels sit at the centre of each cell.
plt.yticks(np.arange(0.5, len(corr.index), 1), corr.index)
plt.xticks(np.arange(0.5, len(corr.columns), 1), corr.columns, rotation=45)
fig.colorbar(c)

# **Data Manipulation**

In [None]:
# Calendar month of a flagged holiday week -> holiday label.
# ('Christmas' was previously misspelled 'Christmax', which produced a dummy column
# that disagreed with the holiday names used by the Prophet holidays table.)
HOLIDAY_BY_MONTH = {
    2: 'Super_Bowl',
    9: 'Labor_Day',
    11: 'Thanksgiving',
    12: 'Christmas',
}

for ds in Datasets.keys():
    frame = Datasets[ds]
    if 'isHoliday' not in frame.columns:
        # This split was already transformed by an earlier run of the cell;
        # skipping keeps the cell safe to re-run instead of raising a KeyError.
        continue

    # make holidays more specific: label only rows flagged as holiday weeks;
    # every other row (and any holiday in an unlisted month) stays missing.
    holiday_type = (frame['Date'].dt.month
                    .map(HOLIDAY_BY_MONTH)
                    .where(frame['isHoliday'] == True))
    frame = frame.assign(Holiday_Type=holiday_type).drop(columns=['isHoliday'])

    # 1-hot encoding for categorical features (missing Holiday_Type yields all-zero dummies)
    frame = pd.get_dummies(frame, columns=["Type", "Holiday_Type"])

    # data imputation: remaining missing values are replaced by 0
    Datasets[ds] = frame.fillna(value=0)
    display(Datasets[ds].head())

In [None]:
# Number of training rows per store, listed without sorting by count.
Datasets['train'].Store.value_counts(sort=False)

# **Modeling**

In [None]:
# Only this store is modelled below: the training and prediction loops skip every
# other store, and the value is embedded in the submission file names.
STORE_ID = 44

## Facebook Prophet

In [None]:
# Holiday table passed to Prophet: one row per holiday week.
# Dates are written day-month-year, so the format is given explicitly; without it
# pandas reads '12-02-2010' month-first (2 December) and either mis-dates the
# ambiguous entries or rejects the unambiguous ones, depending on the version.
# Thanksgiving falls in November (month 11), matching the Holiday_Type rule used
# in the data-manipulation step.
holidays = pd.DataFrame({
    'holiday': ['Super_Bowl']*4 + ['Labor_Day']*4 + ['Thanksgiving']*4 + ['Christmas']*4,
    'ds': pd.to_datetime(['12-02-2010', '11-02-2011', '10-02-2012', '08-02-2013',
                          '10-09-2010', '09-09-2011', '07-09-2012', '06-09-2013',
                          '26-11-2010', '25-11-2011', '23-11-2012', '29-11-2013',
                          '31-12-2010', '30-12-2011', '28-12-2012', '27-12-2013',],
                         format='%d-%m-%Y'),
    # Window of days around each date that is also treated as the holiday:
    # none before, one after.
    'lower_window': 0,
    'upper_window': 1,
})

In [None]:
# Independent working copies for the Prophet section, so the shared Datasets
# frames stay untouched. Prophet expects the date column to be named 'ds' and
# the target column 'y'.
data_test = Datasets['test'].copy()
data_train = Datasets['train'].rename(columns={'Date': 'ds', 'weeklySales': 'y'})
data_train.head()

In [None]:
from fbprophet import Prophet

# Hyper-parameters shared by every per-department model.
prophet_params = dict(
    growth='linear', # linear or logistic
    changepoints=None, # list of dates at which to include potential changepoints
    n_changepoints=11, # number of potential changepoints
    changepoint_range=0.69, # proportion of history in which trend changepoints will be estimated
    yearly_seasonality='auto',
    weekly_seasonality='auto',
    daily_seasonality='auto',
    seasonality_mode='additive',
    seasonality_prior_scale=6.9,
    holidays_prior_scale=6.9,
    changepoint_prior_scale=0.169,
    mcmc_samples=0, # if > 0: Bayesian inference with number of MCMC samples, else: MAP estimation
    interval_width=0.69, # width of the uncertainty intervals provided for the forecast
    uncertainty_samples=690, # number of simulated draws used to estimate uncertainty intervals
)
# Columns registered as additional regressors on every model.
extra_regressors = ('Size', 'CPI', 'Unemployment')

# Models maps (Store, Dept) -> [fitted Prophet model or None, mean of the training target].
Models = dict()

for name, group in data_train.groupby(["Store", "Dept"]):

    # Only the store selected by STORE_ID is modelled.
    if name[0] != STORE_ID:
        continue

    data_grouped = group.drop(columns=["Store", "Dept"])
    mean_sales = np.mean(group['y'])
    print(f"\n Training Facebook's Prophet for store={name[0]}, dept={name[1]} with {len(data_grouped)} samples ...")

    if len(data_grouped) >= 3:
        # Creating model
        model = Prophet(holidays=holidays, **prophet_params)
        for col in extra_regressors:
            model.add_regressor(name=col, prior_scale=None, standardize='auto', mode='additive')

        # Training model
        t1 = time.time()
        model.fit(data_grouped)
        t2 = time.time()
        print(f"\t\t ... in {round(t2-t1, 3)} seconds")

        Models[name] = [model, mean_sales]
    else:
        # Too few rows to fit: keep only the mean so prediction can fall back to it.
        print(f"\t\t Number of samples must be larger than 2 !!!")
        Models[name] = [None, mean_sales]

In [None]:
# Prophet expects the date column to be named 'ds'.
data_test = data_test.rename(columns={'Date': 'ds'})

# Add, as all-zero columns, every training feature the test frame lacks
# (e.g. one-hot columns for categories that never occur in the test period).
for col in data_train.columns:
    if col in ('ds', 'y') or col in data_test.columns:
        continue
    data_test[col] = 0

# Fallback forecast for (Store, Dept) pairs that never appeared in training.
overall_mean = np.mean(data_train.y)

result = []
for name, group in data_test.groupby(["Store", "Dept"]):

    # Only the store selected by STORE_ID is predicted.
    if name[0] != STORE_ID:
        continue

    data_grouped = group.drop(columns=["Store", "Dept"])
    print(f"\n Predicting Facebook's Prophet for store={name[0]}, dept={name[1]} with {len(data_grouped)} samples ...")

    # Models values are [model_or_None, training mean]; unseen pairs use the overall mean.
    model, mean_value = Models.get(name, (None, overall_mean))

    forecast = None
    if model is not None:
        try:
            t1 = time.time()
            forecast = model.predict(df=data_grouped)
            t2 = time.time()
            print(f"\t ... in {round(t2-t1, 3)} seconds")
            forecast["Store"] = name[0]
            forecast["Dept"] = name[1]
        except Exception as e:
            # Best effort: report the failure and fall back to the mean below.
            print(e)
            forecast = None

    if forecast is None:
        forecast = group.copy()
        forecast['yhat'] = mean_value

    # 1st-date: Friday, 05-02-2010
    # model.plot_components(forecast, weekly_start=5, yearly_start=31+5)

    # Replace negative forecasts with a small positive constant. A single .loc
    # assignment is used because the chained form forecast['yhat'][mask] = ...
    # may write to a temporary copy and leave the frame unchanged.
    forecast.loc[forecast['yhat'] < 0, 'yhat'] = 20
    result.append(forecast[['Store', 'Dept', 'ds', 'yhat']])

In [None]:
# Stack the per-department forecasts into a single submission frame.
result = (pd.concat(result, axis=0, ignore_index=True)
          .rename(columns={'yhat': 'Weekly_Sales'}))

# Submission id has the form "<Store>_<Dept>_<YYYY-MM-DD>".
store_part = result['Store'].apply(str)
dept_part = result['Dept'].apply(str)
date_part = result['ds'].dt.strftime('%Y-%m-%d')
result['Id'] = store_part + '_' + dept_part + '_' + date_part

submission_path = f'submission_Prophet_store={STORE_ID}.csv'
result[['Id', 'Weekly_Sales']].to_csv(submission_path, index=False)

display(result.head())

## **SARIMAX** - **S**easonal **A**uto**R**egressive **I**ntegrated **M**oving **A**verage with e**X**ogenous regressors

In [None]:
# Date-indexed working copies for the SARIMAX section (set_index returns new frames,
# so the shared Datasets entries are not modified).
data_train = Datasets['train'].set_index(keys='Date', drop=True)
data_test = Datasets['test'].set_index(keys='Date', drop=True)

# Every training feature absent from the test frame is added as an all-zero column;
# the target column is the only one deliberately left out.
missing_in_test = [
    col for col in data_train.columns
    if col != 'weeklySales' and col not in data_test.columns
]
for col in missing_in_test:
    data_test[col] = 0

# Sanity check on one series: end of the training window, start of the test window.
is_train_sample = (data_train.Store==1) & (data_train.Dept==1)
is_test_sample = (data_test.Store==1) & (data_test.Dept==1)
display(data_train[is_train_sample].tail())
display(data_test[is_test_sample].head())

In [None]:
from statsmodels.tsa.statespace.sarimax import SARIMAX

# The target is shifted by this many rows before fitting (see NOTE in the loop).
steps = -1
Models = dict()
# One prediction frame (Store, Dept, Weekly_Sales, Date) per department of STORE_ID.
results = list()

# Fallback value for departments that have no training rows at all.
overall_mean_sales = np.mean(data_train['weeklySales'])

for name, group_test in data_test.groupby(["Store", "Dept"]):

    # Only the store selected by STORE_ID is modelled.
    if name[0] != STORE_ID:
        continue

    print(f"\n SARIMAX for store={name[0]}, dept={name[1]}")

    test_size = len(group_test)
    if test_size < 1:
        continue

    # preparing data
    group = data_train[(data_train["Store"]==name[0]) & (data_train["Dept"]==name[1])]
    train_size = len(group)

    # Value used whenever no usable model forecast exists: the department's own
    # training mean, or the overall mean if the department was never observed.
    fallback_sales = np.mean(group['weeklySales']) if train_size >= 1 else overall_mean_sales

    predictions = None
    if train_size >= 1:
        try:
            data_grouped = group.drop(columns=["Store", "Dept"])
            data_grouped.index = pd.DatetimeIndex(data=data_grouped.index.values,
                                                  freq=data_grouped.index.inferred_freq)
            # NOTE(review): with steps=-1 the target at row t is the sales of row t+1
            # (the last row becomes NaN) while exog stays at row t. Confirm this
            # lead is intended, since forecasting below starts right after the
            # last training row.
            features_imposed = data_grouped['weeklySales'].shift(steps)
            # Cast to float so one-hot columns (bool/uint8) form a homogeneous numeric matrix.
            features_exposed = data_grouped.drop(columns=['weeklySales']).astype(float)
            seasonal_order = 52 if train_size>52 else 1

            # Creating model
            model = SARIMAX(endog=features_imposed,
                            exog=features_exposed,
                            order=(1, 0, 0), # p,d,q - number of AR parameters, differences, and MA parameters
                            seasonal_order=(0, 1, 0, seasonal_order), # P,D,Q,s - AR parameters, differences, MA parameters, and periodicity
                            trend='ct', # c: const - t: time
                            enforce_invertibility=False,
                            enforce_stationarity=False)

            # Training model
            print(f"\t Training with {train_size} samples ...")
            t1 = time.time()
            forecaster = model.fit()
            t2 = time.time()
            print(f"\t\t ... in {round(t2-t1, 3)} seconds")
            # print(forecaster.summary())

            # Models[name] = forecaster

            # Predicting
            # Exogenous values are consumed positionally, so the test columns must be
            # in exactly the training order. Columns that were appended to the test
            # frame (missing one-hot categories) otherwise end up in different
            # positions than in training.
            features = (group_test
                        .drop(columns=["Store", "Dept"])
                        .reindex(columns=features_exposed.columns, fill_value=0)
                        .astype(float))
            features.index = pd.DatetimeIndex(data=features.index.values,
                                              freq=features.index.inferred_freq)

            print(f"\t Predicting with {test_size} samples ...")
            t1 = time.time()
            predictions = forecaster.predict(start=train_size,
                                             end=train_size+test_size-1,
                                             exog=features)
            t2 = time.time()
            print(f"\t\t ... in {round(t2-t1, 3)} seconds")

            predictions = predictions.to_frame(name='Weekly_Sales')
            predictions["Store"] = name[0]
            predictions["Dept"] = name[1]

        except Exception as e:
            # Best effort: report the failure and fall back to the mean below.
            print(e)
            predictions = None

    if predictions is None:
        predictions = group_test.copy()
        predictions['Weekly_Sales'] = fallback_sales

    # Build a fresh positional-indexed frame (no in-place edits on a slice), then
    # attach the test dates row by row.
    predictions = predictions[["Store", "Dept", 'Weekly_Sales']].reset_index(drop=True)
    predictions['Date'] = group_test.index
    # Replace negative forecasts with the fallback value via a single .loc
    # assignment; the chained form may write to a temporary copy.
    predictions.loc[predictions['Weekly_Sales'] < 0, 'Weekly_Sales'] = fallback_sales

    assert len(predictions)==test_size, f"[xxx] #predictions = {len(predictions)} != {test_size}"
    results.append(predictions)

In [None]:
# Stack the per-department SARIMAX predictions into one frame.
result = pd.concat(results, axis=0, ignore_index=True)
# result.to_csv('test.csv', index=False)
result['Date'] = pd.to_datetime(result['Date'], format='%Y-%m-%d %H:%M:%S')

# Submission id has the form "<Store>_<Dept>_<YYYY-MM-DD>"; the ids are cast to int
# first so they are rendered without a decimal part.
store_part = result['Store'].astype(int).apply(str)
dept_part = result['Dept'].astype(int).apply(str)
date_part = result['Date'].dt.strftime('%Y-%m-%d')
result['Id'] = store_part + '_' + dept_part + '_' + date_part

submission_path = f'submission_SARIMAX_store={STORE_ID}.csv'
result[['Id', 'Weekly_Sales']].to_csv(submission_path, index=False)

display(result.head())