In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import warnings
from scipy import stats
from itertools import product
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression, RidgeCV, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, cross_validate
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess
from statsmodels.tsa.stattools import adfuller
from learntools.time_series.utils import plot_periodogram, seasonal_plot
from xgboost import XGBRegressor


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Plan
1. Visualization
2. Data preparation
3. Choose a model
4. Tune the model

In [None]:
def ad(df):
    """Print the p-value reported by ``adfuller`` for ``df``.

    ``adfuller(df)`` is called once and element ``[1]`` of its result is
    printed as ``p-value = ...``. Nothing is returned.
    """
    p_value = adfuller(df)[1]
    print('p-value = {}'.format(p_value))

# Upload files
## train.csv
The training data, comprising time series of features store_nbr, family, and onpromotion as well as the target sales.
store_nbr identifies the store at which the products are sold.
family identifies the type of product sold.
sales gives the total sales for a product family at a particular store at a given date. Fractional values are possible since products can be sold in fractional units (1.5 kg of cheese, for instance, as opposed to 1 bag of chips).
onpromotion gives the total number of items in a product family that were being promoted at a store at a given date.

In [None]:
# Load the training data. `date` is parsed to datetime and then converted to a
# daily Period, the same representation used for the other tables.
# `infer_datetime_format` is not passed: it is deprecated since pandas 2.0 and
# is not needed to parse the column.
df_train = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/train.csv', parse_dates=['date'])
df_train['date'] = df_train.date.dt.to_period('D')
store_sales = df_train.set_index(['store_nbr', 'family', 'date']).sort_index()

# Average over all stores and families per day once, then split out the two
# single-column frames used by the plots below.
daily_mean = store_sales.groupby('date').mean()
prom = daily_mean['onpromotion'].to_frame()
average_sales = daily_mean['sales'].to_frame()

In [None]:
store_sales

In [None]:
df_train.head()

## test.csv
The test data, having the same features as the training data. You will predict the target sales for the dates in this file.
The dates in the test data are for the 15 days after the last date in the training data.

In [None]:
df_test = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/test.csv', parse_dates=['date'])
df_test['date'] = df_test.date.dt.to_period('D')
df_test.head()

## stores.csv
Store metadata, including city, state, type, and cluster.
cluster is a grouping of similar stores.

In [None]:
stores = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/stores.csv', index_col='store_nbr')
stores.head()

## oil.csv
Daily oil price. Includes values during both the train and test data timeframes. (Ecuador is an oil-dependent country and its economic health is highly vulnerable to shocks in oil prices.)

In [None]:
# Daily oil price, indexed by a daily Period. The frequency alias is written
# as 'D', matching the train/test/holiday tables in this notebook.
oil = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/oil.csv', parse_dates=['date'])
oil = oil.set_index('date').to_period('D')
oil.head()

## holidays_events.csv
Holidays and Events, with metadata
NOTE: Pay special attention to the transferred column. A holiday that is transferred officially falls on that calendar day, but was moved to another date by the government. A transferred day is more like a normal day than a holiday. To find the day that it was actually celebrated, look for the corresponding row where type is Transfer. For example, the holiday Independencia de Guayaquil was transferred from 2012-10-09 to 2012-10-12, which means it was celebrated on 2012-10-12. Days that are type Bridge are extra days that are added to a holiday (e.g., to extend the break across a long weekend). These are frequently made up by the type Work Day which is a day not normally scheduled for work (e.g., Saturday) that is meant to payback the Bridge.
Additional holidays are days added to a regular calendar holiday, for example, as typically happens around Christmas (making Christmas Eve a holiday).

In [None]:
holiday = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/holidays_events.csv', parse_dates=['date'])
holiday = holiday.set_index('date').to_period('D')
holiday.head()

In [None]:
# Transactions table, indexed by a daily Period. The frequency alias is
# written as 'D', matching the train/test/holiday tables in this notebook.
transaction = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/transactions.csv', parse_dates=['date'])
transaction = transaction.set_index('date').to_period('D')
transaction.tail()

# See plots

## 1. Date

In [None]:
sales_plot = average_sales.copy()

In [None]:
# Overlay the daily average sales of 2016 and 2017 on a shared positional x axis.
plt.figure(figsize=(20,6))
for year in ('2016', '2017'):
    plt.plot(sales_plot.loc[year]['sales'].values, label=year)
plt.legend();

In [None]:
sales_plot.loc['2016']['sales'].plot(figsize=(20,6));

In [None]:
#sales_plot.loc['2017'].rolling(6).mean().ewm(alpha=0.05).mean()

In [None]:
# Mean sales per day of week, one line per year.
b_days = sales_plot.copy()
b_days['dayofweek'] = b_days.index.day_of_week
plt.figure(figsize=(20,6))
for year in ('2015', '2016', '2017'):
    weekday_mean = b_days.loc[year].groupby('dayofweek')['sales'].mean()
    plt.plot(weekday_mean.values, label=year)
plt.legend();

In [None]:
# Share (in percent) of each day of week in the 2016 weekday means.
percent_days = b_days.loc['2016'].groupby('dayofweek')['sales'].mean()
# Vectorised: the total is computed once instead of once per element.
percent_days = percent_days * 100 / percent_days.sum()
plt.plot(percent_days)
percent_days

In [None]:
# Weekly mean of the daily average sales, one line per year.
week = sales_plot.resample('w').mean()
plt.figure(figsize=(20,6))
for year in ('2015', '2016', '2017'):
    plt.plot(week.loc[year]['sales'].values, label=year)
plt.legend();

In [None]:
week.loc['2016'].plot(figsize=(20,6));

In [None]:
# Share (in percent) of each week number in the weekly means from 2015 on.
week['week'] = week.index.week
percent_weeks = week.loc['2015':].groupby('week')['sales'].mean()
# Vectorised: the total is computed once instead of once per element.
percent_weeks = percent_weeks * 100 / percent_weeks.sum()
percent_weeks.head()

In [None]:
# Monthly mean of the daily average sales, one line per year.
month = sales_plot.resample('m').mean()
plt.figure(figsize=(20,6))
for year in ('2015', '2016', '2017'):
    plt.plot(month.loc[year]['sales'].values, label=year)
plt.legend();

In [None]:
# Share (in percent) of each calendar month in the monthly means from 2014 on.
month['month'] = month.index.month
percent_m = month.loc['2014':].groupby('month')['sales'].mean()
# Vectorised: the total is computed once instead of once per element.
percent_m = percent_m * 100 / percent_m.sum()
percent_m

In [None]:
# Quarterly mean of the daily average sales, one line per year.
quarter = sales_plot.resample('q').mean()
plt.figure(figsize=(20,6))
for year in ('2015', '2016', '2017'):
    plt.plot(quarter.loc[year]['sales'].values, label=year)
plt.legend();

In [None]:
# Share (in percent) of each quarter in the quarterly means from 2016 on.
quarter['q'] = quarter.index.quarter
percent_q = quarter.loc['2016':].groupby('q')['sales'].mean()
# Vectorised: the total is computed once instead of once per element.
percent_q = percent_q * 100 / percent_q.sum()
percent_q

## 2. Holidays

In [None]:
holiday.head()

In [None]:
holiday.type.value_counts()

In [None]:
holiday.locale.value_counts()

In [None]:
# Keep only holiday rows whose date also appears in the sales series, then
# mark those dates on top of the full average-sales curve.
holidays = holiday.loc[holiday.index.isin(sales_plot.index)]
plt.figure(figsize=(20,6))
# plt.plot with the 'o' format draws markers only, which is what plot_date
# did by default; plot_date itself is deprecated in Matplotlib.
plt.plot(holidays.index, sales_plot.loc[holidays.index], 'o', color='C3')
plt.plot(sales_plot);

In [None]:
# Same view restricted to 2016.
holidays_2016_idx = holidays.loc['2016'].index
plt.figure(figsize=(20,6))
# 'o' = markers only (the default format of the deprecated plt.plot_date).
plt.plot(holidays_2016_idx, sales_plot.loc[holidays_2016_idx], 'o', color='C3')
plt.plot(sales_plot.loc['2016']);

In [None]:
def hol_plot(df, col='C3', lab='N'):
    """Mark the 2016 dates of ``df`` on top of the 2016 average-sales curve.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index holds the dates to mark; only ``df.loc['2016']``
        is used.
    col : str
        Marker colour.
    lab : str
        Legend label for the markers.

    Notes
    -----
    Reads the notebook-level ``sales_plot`` and draws with ``plt`` on the
    current figure. The silver 2016 sales line is drawn on every call.
    """
    dates_2016 = df.loc['2016'].index
    # 'o' draws markers only, the default format of the deprecated
    # plt.plot_date that this call replaces.
    plt.plot(dates_2016, sales_plot.loc[dates_2016], 'o', color=col, label=lab)
    plt.plot(sales_plot.loc['2016'], color='silver')

In [None]:
# Rows that were celebrated on their own date and are not make-up work days.
kept_in_place = (holidays.transferred == False) & (holidays.type != 'Work Day')
holiday_nat = holidays.loc[(holidays.locale == 'National') & kept_in_place]
holiday_loc = holidays.loc[(holidays.locale == 'Local') & kept_in_place]
holiday_reg = holidays.loc[(holidays.locale == 'Regional') & kept_in_place]
holiday_trans = holidays.loc[(holidays.transferred == True)]

# One marker colour per group, all drawn on the same figure.
plt.figure(figsize=(20,6))
for group, colour, label in (
    (holiday_nat, 'orange', 'national'),
    (holiday_loc, 'black', 'local'),
    (holiday_reg, 'red', 'regional'),
    (holiday_trans, 'blue', 'transferred'),
):
    hol_plot(group, col=colour, lab=label)
plt.legend()

## 3. Promotion

In [None]:
scale = MinMaxScaler()

In [None]:
# Compare 2016 average sales with average promotions, each min-max scaled
# independently so both curves share the same 0..1 range.
sales_2016 = sales_plot.loc['2016']
prom_2016 = prom.loc['2016']
plt.figure(figsize=(20,6))
plt.plot(scale.fit_transform(sales_2016.values), label='sales')
plt.plot(scale.fit_transform(prom_2016.values), label='prom')
plt.legend();
prom_2016.corrwith(sales_2016.sales)

## 4. Transaction

In [None]:
# Daily mean number of transactions, compared with 2016 average sales after
# scaling each curve independently to 0..1.
trans = transaction.resample('d').mean()['transactions'].to_frame()
sales_2016 = sales_plot.loc['2016']
trans_2016 = trans.loc['2016']
plt.figure(figsize=(20,6))
plt.plot(scale.fit_transform(sales_2016.values), label='sales')
plt.plot(scale.fit_transform(trans_2016.values), label='transactions')
plt.legend();
trans_2016.corrwith(sales_2016.sales)

##  5. Oil

In [None]:
# Daily oil price with gaps forward-filled from the previous available value.
# `.ffill()` replaces `fillna(method='ffill')`, which pandas has deprecated.
oils = oil.resample('D').mean().ffill()
plt.figure(figsize=(20,6))
# Each curve is min-max scaled independently so both share the 0..1 range.
plt.plot(scale.fit_transform(sales_plot.loc['2017'].values), label='sales')
plt.plot(scale.fit_transform(oils.loc['2017'].values), label='oil')
plt.legend();
oils.loc['2017'].corrwith(sales_plot.loc['2017'].sales)

# Make models and features

In [None]:
# Stack train and test rows into one frame with a fresh 0..n-1 index.
X_train = df_train.copy()
X_test = df_test.copy()
full_df = pd.concat([X_train, X_test], ignore_index=True)

# Same data indexed by (store_nbr, family, date), plus per-day averages.
X_store = full_df.set_index(['store_nbr', 'family', 'date']).sort_index()
daily_avg = X_store.groupby('date').mean().squeeze()
X_prom = daily_avg['onpromotion'].to_frame()
av_sales = daily_avg['sales'].to_frame()
full_df.head()

In [None]:
# for time features
def creat_time_features(df):
    """
    Create calendar features from the time index of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by a ``PeriodIndex`` or a ``DatetimeIndex``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with these columns added, in this order:
        ``dayofweek``, ``weekofyear``, ``quarter``, ``month``, ``dayofyear``
        and ``week_in_month`` (``ceil(day_of_month / 7)``, stored as float).
        The input frame is not modified.
    """
    X = df.copy()
    idx = X.index
    if isinstance(idx, pd.DatetimeIndex):
        # DatetimeIndex has no `.week` attribute in pandas >= 2.0; the week
        # number is taken from isocalendar() instead.
        weekofyear = idx.isocalendar().week.astype('int64').to_numpy()
    else:
        weekofyear = idx.week
    X['dayofweek'] = idx.day_of_week
    X['weekofyear'] = weekofyear
    X['quarter'] = idx.quarter
    X['month'] = idx.month
    X['dayofyear'] = idx.day_of_year
    # Vectorised ceil instead of a per-row apply: days 1-7 -> 1.0, 8-14 -> 2.0, ...
    X['week_in_month'] = np.ceil(np.asarray(idx.day) / 7)
    return X

In [None]:
def model_cv (X, y):
    """Cross-validate a linear regression on time-ordered folds and plot August 2017.

    Rows up to and including ``'2017-07'`` are passed to ``cross_validate``
    with a ``TimeSeriesSplit`` and a ``LinearRegression``. The mean MAE and the
    mean of the per-fold RMSLE values are printed. Every fitted fold estimator
    then predicts the ``'2017-08'`` rows of ``X``; the prediction (red) and the
    ``'2017-08'`` values of ``y`` (silver) are drawn in one subplot per fold.

    Parameters
    ----------
    X : pandas.DataFrame
        Features; the index must support the label slices ``:'2017-07'`` and
        ``'2017-08'``.
    y : pandas.Series or pandas.DataFrame
        Target aligned with ``X``.

    Returns
    -------
    None
    """
    cv = TimeSeriesSplit()
    model = LinearRegression()
    result = cross_validate(
        model,
        X.loc[:'2017-07'],
        y.loc[:'2017-07'],
        cv=cv,
        scoring=["neg_mean_absolute_error", "neg_mean_squared_log_error"],
        return_estimator=True,
    )
    # The scorers are negated errors, so flip the sign back before reporting.
    mae = -result["test_neg_mean_absolute_error"]
    rmsle = np.sqrt(-result["test_neg_mean_squared_log_error"])
    print('mae = {:.3f}\nrmsle = {:.3f}'.format(mae.mean(), rmsle.mean()))

    X_plt = X.loc['2017-08']
    y_plt = y.loc['2017-08']
    estimators = result['estimator']
    # One 20x15 figure with a row per fold. Passing `ax` explicitly keeps every
    # curve on the intended axes instead of leaving the sized figure empty
    # while each DataFrame.plot() call opens a figure of its own.
    fig, axes = plt.subplots(len(estimators), 1, figsize=(20, 15), squeeze=False)
    for ax, estimator in zip(axes[:, 0], estimators):
        y_pred = pd.DataFrame(estimator.predict(X_plt), index=X_plt.index, columns=['sales'])
        y_pred.plot(ax=ax, color='red')
        y_plt.plot(ax=ax, color='silver')

In [None]:
def X_y(df):
    """Split the rows from 2017 onward into features ``X`` and target ``y``.

    Rows containing any missing value are dropped once, before the split, so
    ``X`` and ``y`` always share the same index (previously the two were
    filtered separately and could diverge if a feature column held NaN).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``sales`` column and an index that supports the label
        slice ``'2017':``.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is the frame without ``sales`` and ``y`` is the
        ``sales`` column. The input frame is not modified.
    """
    recent = df.loc['2017':].dropna()
    X = recent.drop(columns=['sales'])
    y = recent['sales']
    return X,y
    

## Model 1 only date features

In [None]:
# Calendar features for every date in the per-day average sales frame.
X_full = creat_time_features(av_sales)
# Training features/target: rows from 2017 on with no missing values.
X, y = X_y(X_full)
# Features for the rows from 2017-08-16 onward, with the target column removed.
X_test = X_full.loc['2017-08-16':].drop(columns=['sales'])
model_cv(X,y);

## Model 2 date features + dummies

In [None]:
# One-hot encode day of week and week of month once over the full date range,
# so the train and test slices carry the same dummy columns.
encoded = pd.get_dummies(X_full, columns=['dayofweek', 'week_in_month'])
X_dum = encoded.loc['2017': '2017-08-15'].drop(columns=['sales'])
X_test = encoded.loc['2017-08-16':].drop(columns=['sales'])
model_cv(X_dum,y)

## Model 3 date features + dummies + new_year

In [None]:
# Flag the first day of the year (1 on day-of-year 1, otherwise 0).
X_hol = X_dum.assign(new_year=(X_dum.index.dayofyear == 1).astype('int'))
X_test['new_year'] = (X_test.index.dayofyear == 1).astype('int')
model_cv(X_hol, y)

## Model 4 date features + dummies + new_year + pay_days

In [None]:
def _add_pay_day(features):
    """Return a copy of ``features`` with a boolean ``pay_day`` column.

    A row is flagged when the day of month of its index is 1, 14, 15 or 16,
    or is the last or second-to-last day of that month.
    """
    out = features.copy()
    day = out.index.day
    month_len = out.index.days_in_month
    out['pay_day'] = day.isin([1, 14, 15, 16]) | (day == month_len) | (day == month_len - 1)
    return out


# The same rule is applied to the training and the test features through one
# helper instead of two hand-copied boolean expressions.
X_pay = _add_pay_day(X_hol)
X_test = _add_pay_day(X_test)
model_cv(X_pay,y)

In [None]:
# National and regional holiday rows of 2017. `.copy()` makes `hol` an
# independent frame, so the column assignments below do not write into a slice
# of `holiday` (SettingWithCopyWarning).
hol_2017 = holiday.loc['2017']
hol = hol_2017.loc[hol_2017.locale.isin(['National', 'Regional'])].copy()

# `hol` is 1 for every listed day except holidays that were transferred to
# another date, which get 0.
# NOTE(review): rows of type 'Work Day' would also get 1 here - confirm that is intended.
transferred_away = (hol.type == 'Holiday') & (hol.transferred == True)
hol['hol'] = 1
hol.loc[transferred_away, 'hol'] = 0
hol = pd.get_dummies(hol.drop(columns=['locale','locale_name','description','transferred']), columns=['type'])

X_weakend = X_pay.copy()
# The training features end on 2017-08-15, so the holiday slice ends there too;
# a later end date could append a row that has no matching target.
X_weakend = pd.concat([X_weakend, hol.loc[:'2017-08-15']], axis=1).fillna(0)
# Days with dayofweek 5 or 6 are flagged as well.
X_weakend.loc[X_weakend.index.dayofweek.isin([5,6]), 'hol'] = 1
X_test = pd.concat([X_test, hol.loc['2017-08-16':'2017-08']], axis=1).fillna(0)
X_test.loc[X_test.index.dayofweek.isin([5,6]), 'hol'] = 1
model_cv(X_weakend,y)

## Add oil

In [None]:
# Add a 7-row rolling mean of the forward-filled daily oil frame as a feature.
X_oil = X_weakend.copy()
# `oils` is a DataFrame, so this assigns a frame to a single column.
# NOTE(review): assumes `oils` has exactly one column - confirm.
X_oil['oil'] = oils.loc['2015':].rolling(7).mean()
# The test slice starts its rolling window in 2016 instead of 2015.
X_test['oil']= oils.loc['2016':'2017-08'].rolling(7).mean()
model_cv(X_oil,y)

## Predict

In [None]:
model = LinearRegression()
X_for_subm = X_oil.loc['2017':]
# Wide target: one column per (store_nbr, family), one row per date. Rows
# containing NaN are dropped before unstacking.
y_ = X_store.loc[:,:,'2017':].dropna()['sales'].to_frame().unstack(['store_nbr', 'family'])

model.fit(X_for_subm, y_)
y_pred = pd.DataFrame(model.predict(X_for_subm), columns=y_.columns, index=X_for_subm.index)
y_pred[y_pred < 0 ] = 0  # replace negative predictions with 0

# Check on the data the model was fitted on, for one series (store 1, PRODUCE).
# The selections are computed once and reused for the plot and both metrics.
actual = y_.loc(axis=1)['sales',1, 'PRODUCE'].loc['2017'].values
fitted = y_pred.loc(axis=1)['sales',1, 'PRODUCE'].loc['2017'].values
plt.figure(figsize=(20,6));
plt.plot(actual, label='actual')
plt.plot(fitted, label='fitted')
plt.legend()
# scikit-learn metrics take (y_true, y_pred) in that order.
mae = mean_absolute_error(actual, fitted)
msle = mean_squared_log_error(actual, fitted)
print('mae = {:.3f}\nrmsle = {:.3f}'.format(mae, np.sqrt(msle)))

In [None]:
# Test rows keyed the same way as the training frame, used to look up `id`.
df_test_ = df_test.set_index(['store_nbr', 'family', 'date']).sort_index()

# Wide predictions (one column per store/family), then reshaped to long form.
wide_pred = model.predict(X_test)
y_submit = pd.DataFrame(wide_pred, columns=y_.columns, index=X_test.index)
y_submit_ = y_submit.stack(['store_nbr', 'family'])
y_submit_.loc[y_submit_.sales < 0] = 0

# Attach the submission id and write only the `id` and `sales` columns.
y_submit_ = y_submit_.join(df_test_.id).reindex(columns=['id', 'sales'])
y_submit_.to_csv('submission_5_lr.csv', index=False)
y_submit_

In [None]:
# Per-date mean of the 2017 target values, followed by the per-date mean of
# the linear-model submission.
y_.loc['2017'].stack(['store_nbr', 'family']).groupby('date')['sales'].mean().plot(figsize=(20,6))
y_submit_.groupby('date')['sales'].mean().plot()

## XGB

In [None]:
# Residual target: the wide target minus the wide `y_pred`, each flattened
# with unstack() into a single long column.
y_gbm_ = y_.unstack().to_frame() - y_pred.unstack().to_frame()
# NOTE(review): X_xgb_1 / X_xgb are not referenced by any later active cell in
# the visible part of this notebook (X_xgb_1 only in commented-out code) -
# confirm whether they are still needed.
X_xgb_1 = full_df.set_index(['date']).loc['2017':]
X_xgb = pd.concat([X_xgb_1.loc[:'2017-08-15'], X_oil], axis=1).drop(columns=['id', 'sales'])
X_xgb = X_xgb.set_index(['store_nbr', 'family', X_xgb.index]).sort_index()

In [None]:
# Features for the residual model. The slice ends on 2017-08-15, the same end
# date used for the training slices elsewhere in this notebook. X_store is
# built from train and test rows together, so ending on 2017-08-16 would pull
# in rows that have no counterpart in y_gbm_ and break the row-for-row pairing.
X_gbm = X_store.loc[:,:,'2017':'2017-08-15'].drop(columns=['id', 'sales'])
model_2 = XGBRegressor()
model_2.fit(X_gbm, y_gbm_)

# Boosted fit = flattened `y_pred` + predicted residual, with negatives set to 0.
gbm_pred = pd.DataFrame(model_2.predict(X_gbm), index=X_gbm.index, columns=['sales'])
y_boost = pd.DataFrame( y_pred.unstack().to_frame().values + gbm_pred.values, index=gbm_pred.index, columns=['sales'] )
y_boost.loc[y_boost.sales < 0] = 0

In [None]:
# NOTE: from here on `y_pred` is the long-format boosted fit, not the wide
# linear fit produced earlier.
y_pred = y_boost.copy()

# Check on one series (store 1, PRODUCE); the selections are computed once
# and reused for the plot and both metrics.
actual = y_.loc(axis=1)['sales',1, 'PRODUCE'].loc['2017'].values
boosted = y_pred.loc(axis=0)[1, 'PRODUCE'].loc['2017'].values
plt.figure(figsize=(20,6));
plt.plot(actual, label='actual')
plt.plot(boosted, label='boosted')
plt.legend()
# scikit-learn metrics take (y_true, y_pred) in that order.
mae = mean_absolute_error(actual, boosted)
msle = mean_squared_log_error(actual, boosted)
print('mae = {:.3f}\nrmsle = {:.3f}'.format(mae, np.sqrt(msle)))

In [None]:
#X_test_xgb = pd.concat([X_xgb_1.loc['2017-08-16':], X_test], axis=1).drop(columns=['id', 'sales'])
#X_test_xgb = X_test_xgb.set_index(['store_nbr', 'family', X_test_xgb.index]).sort_index()

In [None]:
# Test features for the residual model: the `onpromotion` column of the
# (store_nbr, family, date)-indexed test frame, sorted by index.
X_test_xgb = df_test_.onpromotion.to_frame().sort_index()

In [None]:
X_test_xgb

In [None]:
# model_2.predict returns one value per row of X_test_xgb, in that frame's
# (store_nbr, family, date) row order. Label the values with X_test_xgb.index:
# y_submit_.index comes from stacking a date-indexed frame and is ordered
# (date, store_nbr, family), so using it here attached predictions to the
# wrong rows.
resid_pred = pd.DataFrame(model_2.predict(X_test_xgb), index=X_test_xgb.index, columns=['sales'])
# y_submit.unstack() lists each (store_nbr, family) column date by date.
# NOTE(review): assumes that matches the sorted row order of X_test_xgb - confirm.
y_boost_2 = pd.DataFrame(y_submit.unstack().to_frame().values + resid_pred.values, index=resid_pred.index, columns=['sales'])
# Re-express the residual predictions on y_submit_'s index so that later
# index-aligned arithmetic pairs every prediction with its own row.
gbm_pred_2 = resid_pred.reorder_levels(y_submit_.index.names).reindex(y_submit_.index)

In [None]:
# Blend weights and overall multiplier.
# NOTE(review): the values 0.8 / 0.2 / 1.05 are not explained in the notebook - confirm.
LR_WEIGHT, GBM_WEIGHT, SCALE_UP = 0.8, 0.2, 1.05

linear_part = y_submit_.drop(columns=['id'])
y_submit2_ = (LR_WEIGHT * linear_part + gbm_pred_2 * GBM_WEIGHT) * SCALE_UP
y_submit2_.loc[y_submit2_.sales < 0] = 0

# Attach the submission id and write only the `id` and `sales` columns.
y_submit2_ = y_submit2_.join(df_test_.id).reindex(columns=['id', 'sales'])
y_submit2_.to_csv('submission_5_xgb.csv', index=False)
y_submit2_

In [None]:
# Per-date mean of `y_pred` (the boosted fit at this point), followed by the
# per-date mean of the blended submission.
y_pred.groupby('date').mean().plot(figsize=(20,5))
y_submit2_.groupby('date')['sales'].mean().plot()