In [2]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import re
import xgboost

from pandas.api.types import CategoricalDtype

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from statsmodels.graphics.tsaplots import plot_pacf
from xgboost import XGBRegressor
import lightgbm
from sklearn.model_selection import GridSearchCV
import joblib
import logging
# import keras


In [3]:
# Read the cleaned dataset; label-like columns are loaded as pandas categoricals
# and `sales` as float.
all_csv = pd.read_csv(
    'cleaned.csv.gz',
    dtype={
        'sales': 'float',
        **dict.fromkeys(
            ['store_nbr', 'family', 'city', 'state', 'type',
             'holiday_type', 'holiday_transferred'],
            'category',
        ),
    },
    parse_dates=['date'],
)
# Store dates as daily periods (period[D]) instead of timestamps.
all_csv['date'] = pd.to_datetime(all_csv['date']).dt.to_period('D')


In [4]:
# Work on a copy so experiments can restart from this cell without reloading the CSV file.
# NOTE(review): the name `all` shadows Python's built-in all() for the rest of the notebook.
all = all_csv.copy()  # we can start experimenting from here without reloading the csv file


In [5]:
# Experiment filters. None means "do not filter on this dimension".
#   filter_by_stores : list of store numbers given as strings (unlike Mine.ipynb),
#                      e.g. ['1', '2']
#   filter_by_family : list of product families, e.g. ['DAIRY', 'PRODUCE']
#   filter_by_dates  : earliest date to keep, as a string, e.g. '2014-06-05'
filter_by_stores = filter_by_family = filter_by_dates = None


In [6]:
# Date boundaries of the training and test windows; the training start follows
# the date filter when one is set.
train_start_date = '2013-01-01' if filter_by_dates is None else filter_by_dates
train_end_date, test_start_date, test_end_date = '2017-08-15', '2017-08-16', '2017-08-31'


In [7]:
# Narrow the working frame to the configured families / stores / start date.
# A filter left as None is skipped.
if filter_by_family is not None:
    all = all.loc[all['family'].isin(filter_by_family)]
if filter_by_stores is not None:
    all = all.loc[all['store_nbr'].isin(filter_by_stores)]
if filter_by_dates is not None:
    all = all.loc[all['date'] >= filter_by_dates]

In [8]:
# Print every column with its dtype for the (possibly filtered) working frame.
all.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3036528 entries, 0 to 3036527
Data columns (total 38 columns):
 #   Column               Dtype    
---  ------               -----    
 0   date                 period[D]
 1   store_nbr            category 
 2   family               category 
 3   sales                float64  
 4   onpromotion          int64    
 5   sales_lag_01         float64  
 6   sales_lag_02         float64  
 7   sales_lag_03         float64  
 8   sales_lag_04         float64  
 9   sales_lag_05         float64  
 10  sales_lag_06         float64  
 11  sales_lag_07         float64  
 12  sales_lag_08         float64  
 13  sales_lag_09         float64  
 14  sales_lag_10         float64  
 15  sales_lag_11         float64  
 16  sales_lag_12         float64  
 17  sales_lag_13         float64  
 18  sales_lag_14         float64  
 19  sales_lag_15         float64  
 20  sales_lag_16         float64  
 21  sales_lag_17         float64  
 22  sales_lag_18      

In [9]:
def one_hot_encode(df):
    """Return a new DataFrame with the categorical feature columns one-hot encoded.

    The columns store_nbr, family, city, state, type, cluster, holiday_type,
    holiday_transferred and weekday are replaced by dummy columns via
    ``pd.get_dummies``; all other columns are carried over as they are.
    """
    categorical_columns = [
        'store_nbr', 'family', 'city', 'state', 'type',
        'cluster', 'holiday_type', 'holiday_transferred', 'weekday',
    ]
    encoded = pd.get_dummies(df, columns=categorical_columns)
    return encoded

In [10]:
# One-hot encode, then strip every character that is not a letter, digit or
# underscore from the column names.
all_ohe = one_hot_encode(all).rename(columns=lambda name: re.sub('[^A-Za-z0-9_]+', '', name))

# Training window: rows dated up to and including train_end_date.
# The features drop the target and the date; the target is a one-column frame
# ('sales') that shares X's row index.
X = all_ohe.loc[all_ohe['date'] <= train_end_date].drop(columns=['sales', 'date'])
y = all_ohe.loc[all_ohe['date'] <= train_end_date, ['sales']]

# Test window: rows dated from test_start_date onwards, same columns as X.
X_test = all_ohe.loc[all_ohe['date'] >= test_start_date].drop(columns=['sales', 'date'])

# (A PCA reduction with n_components=180 is left disabled at this point.)

# NOTE(review): train_test_split shuffles rows by default, so validation rows are
# interleaved in time with training rows instead of being held out chronologically.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=1)

**SVM Regressor**

In [None]:
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import SVR

# MultiOutputRegressor fits one SVR (RBF kernel, C=0.2) per target column;
# y_train holds the single column 'sales'.
model_svr = MultiOutputRegressor(estimator=SVR(kernel='rbf', C=0.2), n_jobs=-1)
model_svr.fit(X_train, y_train)

MultiOutputRegressor(estimator=SVR(C=0.2), n_jobs=-1)

In [10]:
# Predict on both splits and floor negative predictions at zero.
y_pred_train = np.clip(model_svr.predict(X_train), 0, None)
y_pred_val = np.clip(model_svr.predict(X_val), 0, None)

In [12]:
# Root-mean-squared-log-error on the target as stored, for the train and validation splits.
print("RMS log-error train: ",
      np.sqrt(mean_squared_log_error(y_true=y_train, y_pred=y_pred_train)))
print("RMS log-error val: ",
      np.sqrt(mean_squared_log_error(y_true=y_val, y_pred=y_pred_val)))
# The "(actual)" variants apply expm1 to both targets and predictions before scoring.
print("RMS log-error train (actual): ",
      np.sqrt(mean_squared_log_error(y_true=np.expm1(y_train), y_pred=np.expm1(y_pred_train))))
print("RMS log-error val (actual): ",
      np.sqrt(mean_squared_log_error(y_true=np.expm1(y_val), y_pred=np.expm1(y_pred_val))))

RMS log-error train:  0.3722857209983476
RMS log-error val:  0.3892237332423023
RMS log-error train (actual):  0.5323982641064017
RMS log-error val (actual):  0.5629841092840163


In [13]:
def main_predict(model, X_test, verbose=True):
    """Forecast the test window one day at a time, feeding predictions back as lag features.

    Rows are grouped by ``day_of_month`` (the real dates are no longer present
    in ``X_test``). Days are predicted in ascending order; after each day its
    predictions are written into the ``sales_lag_NN`` column of every later
    day, where ``NN`` is the distance in days, so later days are predicted
    from earlier predictions.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)``. Both a 1-D result and a 2-D
        single-column result are accepted; the result is flattened.
    X_test : pandas.DataFrame
        Feature rows with a ``day_of_month`` column and ``sales_lag_NN``
        columns. The frame itself is not modified (a copy is used).
    verbose : bool, default True
        Print each day's prediction array.

    Returns
    -------
    numpy.ndarray
        1-D array of predictions floored at zero, concatenated day by day,
        with rows inside a day in ``X_test`` order.

    Raises
    ------
    ValueError
        If ``model.predict`` does not return one value per row, or if a later
        day has a different number of rows than the day just predicted.

    Notes
    -----
    * Using ``day_of_month`` as the time axis assumes the whole window lies
      inside one calendar month.
    * Predictions are copied to later days by position, which assumes every
      day lists the same series in the same row order -- TODO confirm against
      how X_test is built.
    """
    if len(X_test) == 0:
        # Nothing to forecast; min()/max() of an empty column cannot drive range().
        return np.array([])

    X_test_mod = X_test.copy()
    day_values = X_test_mod['day_of_month'].to_numpy()
    start_day, end_day = int(day_values.min()), int(day_values.max())
    days = range(start_day, end_day + 1)

    # Only lag columns are rewritten below, so the per-day row masks stay valid
    # for the whole run and are built once instead of inside the nested loop.
    day_masks = {day: day_values == day for day in days}
    day_sizes = {day: int(mask.sum()) for day, mask in day_masks.items()}

    daily_preds = []
    for day in days:
        if day_sizes[day] == 0:
            continue  # no rows for this day, nothing to predict or propagate

        # Flatten so (n,) and (n, 1) outputs are handled the same way.
        pred = np.ravel(model.predict(X_test_mod.loc[day_masks[day]]))
        if pred.shape[0] != day_sizes[day]:
            raise ValueError(
                f"model.predict returned {pred.shape[0]} values for "
                f"{day_sizes[day]} rows on day {day}"
            )
        pred = np.maximum(pred, 0)
        if verbose:
            print(pred)
        daily_preds.append(pred)

        # Fill out future lag values now that this day's sales figure is available.
        for future in range(day + 1, end_day + 1):
            lag_col = f'sales_lag_{(future - day):02d}'
            if lag_col not in X_test_mod.columns or day_sizes[future] == 0:
                # Never create a feature column the model was not trained on.
                continue
            if day_sizes[future] != pred.shape[0]:
                raise ValueError(
                    f"day {future} has {day_sizes[future]} rows but day {day} "
                    f"produced {pred.shape[0]} predictions"
                )
            X_test_mod.loc[day_masks[future], lag_col] = pred

    # The leading empty float array keeps the float64 result of the accumulator-based version.
    return np.concatenate([np.array([]), *daily_preds])


In [11]:
# Day-by-day forecast of the test window with the SVR model; main_predict prints one array per day.
y_pred_test = main_predict(model_svr, X_test)

[0.11967785 2.38872427 0.65663952 ... 1.29324115 2.76654721 0.95673513]
[1.12141818 1.87616722 1.41617273 ... 2.88968327 0.62116203 1.67497799]
[2.87934372 2.03320055 1.37923162 ... 0.92270943 0.23353092 1.33441761]
[1.04962632 3.19366553 1.41707919 ... 0.10716022 0.47267722 0.94505277]
[1.08822059 1.61641118 1.23518137 ... 2.01308692 2.82727697 3.14590217]
[0.90681655 2.08577696 1.05045321 ... 1.06011914 2.45407819 0.70312524]
[3.15560875 0.00339373 2.20785042 ... 1.81735793 3.331251   3.38355634]
[0.49802095 0.31372057 2.95853658 ... 1.05715435 0.53881925 0.1964043 ]
[0.54823515 2.41975598 1.06252407 ... 1.64595386 1.8277473  1.08556045]
[2.05839378 3.42893869 0.08696091 ... 2.20040369 2.61810641 0.62928289]
[1.4156125  1.95259127 1.0152039  ... 0.73285419 2.23427956 0.00690789]
[0.10254338 1.24641745 2.9730511  ... 0.21490646 2.88382405 0.07270278]
[2.58955172 0.41465833 2.57284581 ... 2.76483209 2.46913725 3.02756179]
[0.7787405  2.58963486 2.68050365 ... 0.21723778 0.57668876 1.85

In [14]:
# Submission ids are this frame's row index shifted back by the rows added for
# the 4 inserted Christmas days: 4 x 54 x 33 = 7128, which is the difference below.
delta_index = 3008016 - 3000888
# Predictions are mapped back with expm1 before being written out.
submission = pd.DataFrame(
    data={
        'id': X_test.index - delta_index,
        'sales': np.expm1(y_pred_test),
    }
)
submission.to_csv(path_or_buf='submission_svr.csv', index=False)

## Importing RF model

In [25]:
import pickle
# Load the pickled model object stored in model_rf.pkl (path relative to the working directory).
# Presumably a fitted random-forest regressor -- confirm against the notebook that wrote the file.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open('model_rf.pkl', 'rb') as f:
    model_rf = pickle.load(f)


## Importing XGBoost model

In [26]:
import pickle
# Load the pickled model object stored in model_xgb.pkl (path relative to the working directory).
# Presumably a fitted XGBoost regressor -- confirm against the notebook that wrote the file.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open('model_xgb.pkl', 'rb') as f:
    model_xgb = pickle.load(f)

## Ensemble Method

In [29]:
from sklearn.ensemble import VotingRegressor, ExtraTreesRegressor, BaggingRegressor, RandomForestRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_log_error
import warnings
from joblib import Parallel, delayed

In [30]:
class votingEnsembleRegression():
    """Per-target voting ensemble of Ridge, SVR, random-forest and XGBoost regressors.

    For every target column one nested ``VotingRegressor`` is fitted: Ridge and
    SVR are averaged, random forest and XGBoost are averaged, and the two
    averages are averaged again.

    The SVR, random-forest and XGBoost members come from the notebook-level
    variables ``model_svr``, ``model_rf`` and ``model_xgb``, which must exist
    before ``fit`` is called. ``VotingRegressor`` fits clones of the estimators
    it is given, so those objects supply configuration only.

    Parameters
    ----------
    n_jobs : int, default -1
        Passed to ``joblib.Parallel`` for fitting and predicting.
    verbose : int, default 0
        Passed to ``joblib.Parallel``.

    Attributes
    ----------
    estimators_ : list or None
        One fitted ensemble per target column; None until ``fit`` has run.
    estimators : list or None
        Alias of ``estimators_`` (the name earlier versions of ``fit`` wrote to).
    """

    def __init__(self, n_jobs=-1, verbose=0):
        self.n_jobs = n_jobs
        self.verbose = verbose
        self.estimators_ = None

    @property
    def estimators(self):
        """Alias of ``estimators_`` kept for code that used the old attribute name."""
        return self.estimators_

    @estimators.setter
    def estimators(self, value):
        self.estimators_ = value

    def estimator(self, X_train, y_train):
        """Build and fit the nested voting ensemble for a single target.

        Parameters
        ----------
        X_train : feature matrix accepted by the member estimators.
        y_train : 1-D target for one output column.

        Returns
        -------
        The fitted outer ``VotingRegressor``.
        """
        # Silences FutureWarning for the rest of the process this runs in.
        warnings.simplefilter(action='ignore', category=FutureWarning)

        # NOTE(review): Ridge's `normalize` argument was deprecated in scikit-learn 1.0
        # and removed in 1.2; this line needs a scikit-learn version that still accepts it.
        ridge = Ridge(fit_intercept=True, solver='auto', alpha=0.75, normalize=True, random_state=5)
        svr = model_svr
        rf = model_rf
        xgb = model_xgb
        model1 = VotingRegressor([('ridge', ridge), ('svr', svr)])
        model2 = VotingRegressor([('rf', rf), ('xgb', xgb)])
        model = VotingRegressor([('m1', model1), ('m2', model2)])
        model.fit(X_train, y_train)

        return model

    def fit(self, X_train, y_train):
        """Fit one nested ensemble per column of ``y_train``.

        Parameters
        ----------
        X_train : feature matrix.
        y_train : pandas.DataFrame with one column per target, or a
            pandas.Series for a single target.

        Returns
        -------
        self
        """
        if getattr(y_train, 'ndim', 2) == 1:
            # A Series has no second axis to iterate over; treat it as one target column.
            y_train = y_train.to_frame()
        self.estimators_ = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
            delayed(self.estimator)(X_train, y_train.iloc[:, i])
            for i in range(y_train.shape[1])
        )
        return self

    def predict(self, X_test):
        """Predict every target and stack the results along axis 1.

        Returns
        -------
        numpy.ndarray
            The per-target predictions stacked with ``np.stack(..., axis=1)``.

        Raises
        ------
        AttributeError
            If called before ``fit``.
        """
        if self.estimators_ is None:
            raise AttributeError(
                "votingEnsembleRegression is not fitted yet; call fit() before predict()."
            )
        y_pred = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
            delayed(e.predict)(X_test) for e in self.estimators_
        )
        return np.stack(y_pred, axis=1)

In [31]:
# Build the ensemble with its default parallelism settings and fit it on the training split.
votingEnsembleRegression_model = votingEnsembleRegression(n_jobs=-1, verbose=0)
votingEnsembleRegression_model.fit(X_train, y_train)

**Evaluation**

In [32]:
# Predict on both splits with the ensemble and floor negative predictions at zero.
y_pred_train = np.clip(votingEnsembleRegression_model.predict(X_train), 0, None)
y_pred_val = np.clip(votingEnsembleRegression_model.predict(X_val), 0, None)

In [33]:
# Root-mean-squared-log-error of the ensemble on the target as stored, for both splits.
print("RMS log-error train: ",
      np.sqrt(mean_squared_log_error(y_true=y_train, y_pred=y_pred_train)))
print("RMS log-error val: ",
      np.sqrt(mean_squared_log_error(y_true=y_val, y_pred=y_pred_val)))
# The "(actual)" variants apply expm1 to both targets and predictions before scoring.
print("RMS log-error train (actual): ",
      np.sqrt(mean_squared_log_error(y_true=np.expm1(y_train), y_pred=np.expm1(y_pred_train))))
print("RMS log-error val (actual): ",
      np.sqrt(mean_squared_log_error(y_true=np.expm1(y_val), y_pred=np.expm1(y_pred_val))))

RMS log-error train:  0.14742841052730962
RMS log-error val:  0.14552198259112045
RMS log-error train (actual):  0.42161915209476136
RMS log-error val (actual):  0.41084028914506504


In [34]:
# Day-by-day forecast of the test window with the voting ensemble; main_predict prints one array per day.
y_pred_test = main_predict(votingEnsembleRegression_model, X_test)

[0.19834369 2.15970899 1.27344995 ... 2.98216513 3.48104022 1.64701719]
[0.17493266 1.66993516 2.47091652 ... 2.3348894  1.50301823 3.01084877]
[2.74874313 3.05522606 2.95919066 ... 2.90529039 3.36833467 3.17361386]
[2.22088813 3.43762715 2.87835876 ... 1.54481341 1.96484082 0.61985623]
[0.85711875 1.83428036 2.04581065 ... 0.65215971 2.64351976 3.17494882]
[1.11427783 3.45426144 0.3244539  ... 1.6481821  1.02870956 0.594262  ]
[2.50112051 1.81070405 0.03567776 ... 1.46441356 3.36591058 0.76049881]
[2.27222414 2.01976616 2.35518705 ... 2.10879497 1.76562312 1.37300042]
[1.85170051 3.11887359 0.3067603  ... 0.37602163 2.44514466 0.04696336]
[3.36407454 0.70153053 2.40365351 ... 2.68416588 3.00568999 2.71829736]
[2.39724729 3.48764851 2.27876124 ... 2.92905129 1.53056496 1.98109757]
[3.18436059 1.602517   3.18718923 ... 2.96286022 3.22221297 1.74866862]
[0.19949044 0.76450777 0.81486609 ... 3.25336479 3.16483495 0.43576415]
[2.08763877 0.08984864 0.78854499 ... 1.0205013  0.13874434 1.15

In [35]:
# Submission ids are this frame's row index shifted back by the rows added for
# the 4 inserted Christmas days: 4 x 54 x 33 = 7128, which is the difference below.
delta_index = 3008016 - 3000888
# Predictions are mapped back with expm1 before being written out.
submission = pd.DataFrame(
    data={
        'id': X_test.index - delta_index,
        'sales': np.expm1(y_pred_test),
    }
)
submission.to_csv(path_or_buf='submission_ensemble.csv', index=False)

In [36]:
!!jupyter nbconvert FinalEnsembleModel.ipynb --to python

['[NbConvertApp] Converting notebook FinalEnsembleModel.ipynb to python',
 '[NbConvertApp] Writing 8342 bytes to FinalEnsembleModel.py']

In [37]:
!!jupyter nbconvert FinalEnsembleModel.ipynb --to html

['[NbConvertApp] Converting notebook FinalEnsembleModel.ipynb to html',
 '[NbConvertApp] Writing 324541 bytes to FinalEnsembleModel.html']