In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import random

from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor

import time
import os

from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess, Fourier

import optuna
from optuna.samplers import TPESampler


from joblib import Parallel, delayed
import warnings

from path import Path


<font size="4"> This kernel is based on https://www.kaggle.com/xholisilemantshongo/modeling-sales-3-types-of-regression, so if you like my notebook, please upvote his notebook too. I was able to improve the PL score by using different dataframes for training Ridge and RandomForestRegressor and by using Optuna to optimize the hyperparameters. </font>

## Seed

In [None]:
def set_seed(seed=42):
    """Seed the random number sources used by this notebook.

    Sets the ``PYTHONHASHSEED`` environment variable and seeds both the
    standard-library ``random`` module and NumPy's global generator.

    Parameters
    ----------
    seed : int, default 42
        Value used for every seeding call.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)

## Paths

In [None]:
# Root directory of the competition data; the CSV loaders in this notebook
# join file names onto it with the `/` operator (e.g. `path / 'train.csv'`).
path = Path('../input/store-sales-time-series-forecasting')

## Creating common dataframes

In [None]:
def get_calendar():
    """Build the daily calendar feature table.

    Starts from one row per day between 2013-01-01 and 2017-08-31 and adds,
    in order: the merged holiday/event columns, the day of week (``dofw``),
    the working-day flag (``wd``), the forward-filled oil price
    (``dcoilwtico``) and its 7-row rolling mean (``oil_ma7``).

    Returns
    -------
    pandas.DataFrame
        Indexed by daily ``Period`` (``date``) with columns
        ``dofw``, ``type``, ``wd``, ``dcoilwtico`` and ``oil_ma7``.
        Missing ``type`` values are replaced by the string ``"None"``.
    """
    calendar = pd.DataFrame({"date": pd.date_range('2013-01-01', '2017-08-31')})

    # Order matters: add_wd_column reads the `type`/`transferred` columns from
    # the events merge and the `dofw` column; the rolling mean reads the oil price.
    calendar = add_events_df_columns(calendar)
    calendar = add_dofw_column(calendar)
    calendar = add_wd_column(calendar)
    calendar = add_dcoilwtico_column(calendar)
    calendar = add_rolling_mean_oil_column(calendar, 7)

    calendar["date"] = calendar.date.dt.to_period('D')
    calendar = calendar.set_index("date")

    # Explicit copy so the fill below writes to an independent frame rather
    # than to a column subset of the wider table.
    calendar = calendar[["dofw", "type", "wd", "dcoilwtico", "oil_ma7"]].copy()
    calendar["type"] = calendar["type"].fillna("None")

    return calendar




def get_y():
    """Load the training sales and pivot them to one column per series.

    Reads ``train.csv`` from ``path`` (columns ``store_nbr``, ``family``,
    ``date``, ``sales``), converts the dates to daily periods, and unstacks
    ``store_nbr`` and ``family`` into the columns.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``date``; column labels are
        ``('sales', store_nbr, family)`` tuples.
    """
    # `infer_datetime_format` is omitted: it is deprecated in recent pandas
    # and `parse_dates` alone parses the column.
    df_train = pd.read_csv(path / 'train.csv',
                           usecols=['store_nbr', 'family', 'date', 'sales'],
                           dtype={'store_nbr': 'category', 'family': 'category', 'sales': 'float32'},
                           parse_dates=['date'])
    df_train.date = df_train.date.dt.to_period('D')
    df_train = df_train.set_index(['store_nbr', 'family', 'date']).sort_index()

    return df_train.unstack(['store_nbr', 'family'])




def add_events_df_columns(calendar):
    """Left-join the events table from ``create_events_df`` onto ``calendar``.

    The join key is ``date``; calendar rows without a matching event keep
    missing values in the event columns. Returns the merged frame.
    """
    national_events = create_events_df()
    return calendar.merge(national_events, how='left', on="date")



def add_dcoilwtico_column(calendar):
    """Left-join the oil table onto ``calendar`` and forward-fill the price.

    Days without an oil quote take the most recent earlier value of
    ``dcoilwtico``; leading rows with no earlier value stay missing.
    Returns the merged frame.
    """
    oil_df = create_oil_df()
    calendar = calendar.merge(oil_df, on="date", how="left")
    # `.ffill()` replaces the deprecated `fillna(method="ffill")` spelling.
    calendar["dcoilwtico"] = calendar["dcoilwtico"].ffill()

    return calendar
    

def add_rolling_mean_oil_column(calendar, num):
    """Add an ``oil_ma{num}`` column: rolling mean of ``dcoilwtico``.

    Parameters
    ----------
    calendar : pandas.DataFrame
        Must contain a ``dcoilwtico`` column. Modified in place.
    num : int
        Rolling window length in rows.

    Returns
    -------
    pandas.DataFrame
        The same ``calendar`` object with the new column added. Gaps in the
        rolling mean are forward-filled; the leading rows, which have no
        earlier value to copy, remain NaN.
    """
    column = f"oil_ma{num}"
    # `.ffill()` replaces the deprecated `fillna(method="ffill")` spelling.
    calendar[column] = calendar['dcoilwtico'].rolling(num).mean().ffill()

    return calendar



def add_dofw_column(calendar):
    """Add a ``dofw`` column holding the day of week of ``calendar["date"]``.

    Uses the vectorised ``.dt`` accessor instead of a per-row Python lambda,
    so ``date`` must be a datetime-like column. Values are integers with
    Monday = 0 and Sunday = 6. The frame is modified in place and returned.
    """
    # Cast to int64 so the dtype matches what the row-wise apply produced.
    calendar['dofw'] = calendar["date"].dt.dayofweek.astype("int64")
    return calendar


def add_wd_column(calendar):
    """Add a boolean ``wd`` (working day) column to ``calendar``.

    Every day starts as a working day; the rules below are then applied in
    order, later rules overriding earlier ones:

    * ``dofw`` greater than 4                    -> not a working day
    * type ``Bridge``                            -> not a working day
    * type ``Work Day``                          -> working day
    * type ``Transfer``                          -> not a working day
    * type ``Holiday`` with ``transferred`` False -> not a working day
    * type ``Holiday`` with ``transferred`` True  -> working day

    Reads the ``dofw``, ``type`` and ``transferred`` columns. The frame is
    modified in place and returned.
    """
    day_type = calendar["type"]
    was_moved = calendar["transferred"]
    is_holiday = day_type == 'Holiday'

    # (row mask, value to write) pairs, in application order.
    rules = [
        (calendar["dofw"] > 4, False),
        (day_type == 'Bridge', False),
        (day_type == 'Work Day', True),
        (day_type == 'Transfer', False),
        (is_holiday & (was_moved == False), False),
        (is_holiday & (was_moved == True), True),
    ]

    calendar['wd'] = True
    for row_mask, is_working in rules:
        calendar.loc[row_mask, 'wd'] = is_working

    return calendar

    
    

def fill_na(calendar):
    """Replace missing values in the ``type`` column with the string "None".

    The frame is modified in place and returned.
    """
    day_type = calendar["type"]
    calendar["type"] = day_type.fillna(value="None")
    return calendar
 

def create_oil_df():
    """Read ``oil.csv`` from ``path`` with the ``date`` column parsed as datetimes.

    `infer_datetime_format` is omitted: it is deprecated in recent pandas and
    `parse_dates` alone parses the column.
    """
    return pd.read_csv(path / 'oil.csv', parse_dates=['date'])






def create_events_df():
    """Load the national holidays/events, one row per date.

    Reads ``holidays_events.csv`` from ``path``, moves rows dated 2013-04-29
    to 2013-03-29 ('Good Friday' mistake correction), keeps only rows whose
    ``locale`` is ``'National'`` and collapses duplicate dates with
    ``groupby(...).first()``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``date`` with the remaining columns of the CSV.
    """
    events_df = pd.read_csv(path / 'holidays_events.csv', parse_dates=['date'])

    # 'Good Friday' mistake correction. Compare against an explicit Timestamp
    # instead of relying on Series.replace to coerce a string key on a
    # datetime column.
    misdated = events_df['date'] == pd.Timestamp('2013-04-29')
    events_df.loc[misdated, 'date'] = pd.Timestamp('2013-03-29')

    events_df = events_df.sort_values(by="date")
    events_df = events_df[events_df.locale == 'National']
    events_df = events_df.groupby(by="date").first()

    return events_df

## Model

In [None]:
class CustomRegressor():
    """Fit one independent regressor per target column, in parallel.

    Columns whose label has ``'SCHOOL AND OFFICE SUPPLIES'`` as its third
    element get a ``RandomForestRegressor`` trained on the random-forest
    feature frame; every other column gets a ``Ridge`` model trained on the
    ridge feature frame.

    Parameters
    ----------
    n_jobs : int, default -1
        Passed to ``joblib.Parallel`` for both fitting and predicting.
    verbose : int, default 0
        Passed to ``joblib.Parallel``.
    alpha : float, default 0.6
        Regularisation strength passed to ``Ridge``.

    Attributes
    ----------
    estimators_ : list of (model, choice) tuples, or None before ``fit``
        ``choice`` is 0 for a random-forest model and 1 for a ridge model;
        ``predict`` uses it to select the matching feature frame.
    """

    def __init__(self, n_jobs=-1, verbose=0, alpha=0.6):
        self.n_jobs = n_jobs
        self.verbose = verbose

        self.estimators_ = None

        self.alpha = alpha

    def _estimator_(self, X_ridge, X_rf, y):
        """Fit and return ``(model, choice)`` for a single target column ``y``."""
        # Installs a process-wide filter that silences FutureWarning.
        warnings.simplefilter(action='ignore', category=FutureWarning)

        if y.name[2] == 'SCHOOL AND OFFICE SUPPLIES':
            model = RandomForestRegressor(n_estimators = 300, n_jobs=-1, random_state=1)
            X = X_rf
            choice = 0
        else:
            # NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and
            # removed in 1.2, so this call requires an older scikit-learn.
            model = Ridge(fit_intercept=True, solver='auto', alpha=self.alpha, normalize=True)
            X = X_ridge
            choice = 1

        model.fit(X, y)

        return model, choice

    def fit(self, X_ridge, X_rf, y):
        """Fit one model per column of ``y``.

        Parameters
        ----------
        X_ridge, X_rf : feature frames for the ridge and random-forest models.
        y : pandas.DataFrame
            One target series per column.

        Returns
        -------
        CustomRegressor
            ``self``, so calls can be chained.
        """
        self.estimators_ = Parallel(n_jobs=self.n_jobs,
                                    verbose=self.verbose,
                                    )(delayed(self._estimator_)(X_ridge, X_rf, y.iloc[:, i])
                                      for i in range(y.shape[1]))

        return self

    def predict(self, X_ridge, X_rf):
        """Predict every target column.

        Returns
        -------
        numpy.ndarray
            The per-model predictions stacked along axis 1, in the order the
            models were fitted.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.estimators_ is None:
            raise RuntimeError("CustomRegressor.predict() called before fit()")

        # Indexed by the `choice` stored next to each model: 0 -> X_rf, 1 -> X_ridge.
        X = [X_rf, X_ridge]

        y_pred = Parallel(n_jobs=self.n_jobs,
                          verbose=self.verbose)(delayed(fitted_model.predict)(X[choice])
                                                for fitted_model, choice in self.estimators_)

        return np.stack(y_pred, axis=1)

## Training

In [None]:
def _append_deterministic_terms(base, fourier_order):
    """Return a copy of ``base`` with trend and weekly Fourier columns appended.

    The terms are generated by a ``DeterministicProcess`` over the index of
    the module-level ``df`` (linear trend, no constant, no seasonal dummies)
    plus a weekly ``CalendarFourier`` of the given order. Columns are copied
    positionally (``.values``), so ``base`` must have the same number of rows
    as ``df``.
    """
    fourier = CalendarFourier(freq='W', order=fourier_order)
    dp = DeterministicProcess(index=df.index,
                              constant=False,
                              order=1,
                              seasonal=False,
                              additional_terms=[fourier],
                              drop=True)

    terms = dp.in_sample()
    extended = base.copy()

    for column in terms.columns:
        extended[column] = terms[column].values

    return extended


def get_ridge_full(order):
    """Return a copy of the module-level ``X_ridge`` with deterministic terms added.

    Parameters
    ----------
    order : int
        Order of the weekly ``CalendarFourier`` term.
    """
    return _append_deterministic_terms(X_ridge, order)


def get_rf_full():
    """Return a copy of the module-level ``X_rf`` with deterministic terms added.

    The weekly ``CalendarFourier`` term is created with order 0.
    """
    return _append_deterministic_terms(X_rf, 0)


def add_month_column(X, dummies=True):
    """Add the calendar month of each index entry to ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Its index entries must expose a ``.month`` attribute. A ``month``
        column is written onto ``X`` itself.
    dummies : bool, default True
        When true, return a new frame in which ``month`` is replaced by
        one indicator column per month present; otherwise return ``X`` with
        the integer ``month`` column.
    """
    X["month"] = [stamp.month for stamp in X.index]
    if not dummies:
        return X

    return pd.get_dummies(X, columns=['month'], drop_first=False)


def add_season_column(X, dummies=True):
    """Add a coarse season index (``month // 3``) for each index entry of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Its index entries must expose a ``.month`` attribute. A ``season``
        column is written onto ``X`` itself.
    dummies : bool, default True
        When true, return a new frame in which ``season`` is replaced by
        one indicator column per value present; otherwise return ``X`` with
        the integer ``season`` column. (Previously this flag was ignored and
        indicator columns were always produced.)
    """
    # NOTE(review): `month // 3` maps December to 4, separate from
    # January/February (0) — confirm this grouping is intended.
    X["season"] = [x.month // 3 for x in X.index]
    if dummies:
        X = pd.get_dummies(X, columns=['season'], drop_first=False)

    return X



# Inclusive date bounds used to slice the training rows with `.loc`.
train_start = '2017-04-15'
train_end = '2017-08-15'


# Inclusive date bounds used to slice the rows to predict with `.loc`.
test_start = '2017-08-16'
test_end = '2017-08-31'

# Order of the weekly CalendarFourier term passed to get_ridge_full.
order = 3
# Feature switches (1 = on, 0 = off), tested with plain `if` further down.
add_rolling_mean_14 = 1
add_rolling_mean_30 = 0
add_month = 0
add_season = 1
# Ridge regularisation strength passed to CustomRegressor.
alpha = 1.125
    


# Calendar features and the wide sales table (one column per series).
df = get_calendar()
y = get_y()

# Keep only rows from 2017-01-01 onwards in both frames.
y = y.loc['2017-01-01':]           
df = df.loc['2017-01-01':]


# Random-forest features: the event `type` is label-encoded to integers.
le = LabelEncoder()
X_rf = df[["dofw", "wd", "dcoilwtico", "type", "oil_ma7"]].copy()
X_rf["type"] = le.fit_transform(X_rf["type"])



# Ridge features: `dofw` and `type` are one-hot encoded; the indicator for
# the "None" type is dropped.
X_ridge = df[["dofw", "wd", "dcoilwtico", 'type', "oil_ma7"]].copy()
X_ridge = pd.get_dummies(X_ridge, columns=['dofw'], drop_first=True)
X_ridge = pd.get_dummies(X_ridge, columns=['type'])
X_ridge = X_ridge.drop(["type_None"], axis=1)


# Append the trend / Fourier columns to each feature frame.
X_ridge_full = get_ridge_full(order)
X_rf_full = get_rf_full()
    
        
# Optional extra features, controlled by the switches defined above.
if add_rolling_mean_14:
    X_ridge_full = add_rolling_mean_oil_column(X_ridge_full, 14)
    X_rf_full = add_rolling_mean_oil_column(X_rf_full, 14)
        
        
if add_rolling_mean_30:
    X_ridge_full = add_rolling_mean_oil_column(X_ridge_full, 30)
    X_rf_full = add_rolling_mean_oil_column(X_rf_full, 30)
        
if add_month:
    X_ridge_full = add_month_column(X_ridge_full, dummies=True)
    X_rf_full = add_month_column(X_rf_full)
    
if add_season:
    X_ridge_full = add_season_column(X_ridge_full, dummies=True)
    X_rf_full = add_season_column(X_rf_full)
    
    
# The raw oil price is dropped only now, after the rolling means above have
# been computed from it.
X_ridge_full.drop(["dcoilwtico"], axis=1, inplace=True)
X_rf_full.drop(["dcoilwtico"], axis=1, inplace=True)
    


# Training slices (label-based, both bounds inclusive).
X_ridge_full_train = X_ridge_full.loc[train_start:train_end]
X_rf_full_train = X_rf_full.loc[train_start:train_end]
    
y_train = y.loc[train_start:train_end]

    
# Feature slices for the dates to predict.
X_ridge_full_test = X_ridge_full.loc[test_start:test_end]
X_rf_full_test = X_rf_full.loc[test_start:test_end]


In [None]:
# Fit one model per column of y_train, using the ridge or random-forest
# feature frame as selected inside CustomRegressor.
model = CustomRegressor(n_jobs=-1, verbose=0, alpha=alpha)
model.fit(X_ridge_full_train, X_rf_full_train, y_train)

## Submission

In [None]:
# Predictions for the test dates, stacked along axis 1 (one column per fitted model).
y_pred = model.predict(X_ridge_full_test, X_rf_full_test)

In [None]:
# Re-attach the test dates as index and the training column labels.
y_pred = pd.DataFrame(y_pred, index=df.loc[test_start:test_end].index, columns=y.columns)
# Move store_nbr and family from the columns back into the row index.
y_pred = y_pred.stack(['store_nbr', 'family'])
# Replace negative predictions with zero.
y_pred[y_pred < 0] = 0. 

submission = pd.read_csv(path / 'sample_submission.csv', index_col='id')
# NOTE(review): values are assigned positionally — this assumes the stacked
# row order matches the row order of sample_submission.csv; confirm.
submission.sales = y_pred.values
submission.to_csv('submission.csv', index=True)

In [None]:
# Display the submission frame as the cell output.
submission