In [70]:
import pandas as pd
import numpy as np
import re

from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.utils import shuffle

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

seed = 20191194

In [2]:
df = pd.read_csv('case_study_data_ver1.csv', sep = ';')

In [3]:
df = df.dropna() # getting rid of all NULLs

In [4]:
df['QTY'] = df['QTY'].astype(str)
outtakes = df[df.QTY.str.contains('[a-zA-Z]|\.\d\d$|\.\d\d\d\d$')]
df = df.drop(outtakes.index)
df['QTY'] = df['QTY'].apply(lambda x: re.sub('\.','',re.sub('\..$','',x))).astype(float)
df = df.drop([key for key in list(df.keys()) if df[key].sum()==0], axis=1)

In [5]:
df = df[(df.STORE_ID==3)&(df.ITEM_ID==4)]

In [45]:
# add additional features last x days (This step costs data, critical when there isnt much)
# here we could distinguish: if x <= 5% of data points, use additional features, else not
last_x_days = 7
prev_qty = {}
for num in range(1,last_x_days+1):
    prev_qty['qty_prev_{}d'.format(num)] = [df.QTY.values[i-num] for i in range(len(df))]
    
additional_features = pd.DataFrame(prev_qty)
additional_features.head()

Unnamed: 0,qty_prev_1d,qty_prev_2d,qty_prev_3d,qty_prev_4d,qty_prev_5d,qty_prev_6d,qty_prev_7d
0,10.0,48.0,21.0,7.0,12.0,7.0,6.0
1,20.0,10.0,48.0,21.0,7.0,12.0,7.0
2,7.0,20.0,10.0,48.0,21.0,7.0,12.0
3,4.0,7.0,20.0,10.0,48.0,21.0,7.0
4,8.0,4.0,7.0,20.0,10.0,48.0,21.0


In [47]:
#merge it
for key in list(additional_features.keys()):
    df['{}'.format(key)]=additional_features['{}'.format(key)].values

df.head()

Unnamed: 0,ITEM_ID,DATE_ID,STORE_ID,CAT_DAYOFWEEK,DUM_FREEWEEKDAY,DUM_WORKSATURDAY,DUM_SPRINGBREAK,DUM_SUMMERBREAK,DUM_FALLBREAK,DUM_WINTERBREAK,...,QTY,PRICE,CUSTOMER,qty_prev_1d,qty_prev_2d,qty_prev_3d,qty_prev_4d,qty_prev_5d,qty_prev_6d,qty_prev_7d
97,4.0,1.0,3.0,5,0,0,0,0,0,1,...,20.0,1101.45,1607.0,10.0,48.0,21.0,7.0,12.0,7.0,6.0
681,4.0,3.0,3.0,0,0,0,0,0,0,0,...,7.0,1101.45,1498.0,20.0,10.0,48.0,21.0,7.0,12.0,7.0
1192,4.0,4.0,3.0,1,0,0,0,0,0,0,...,4.0,1101.45,1378.0,7.0,20.0,10.0,48.0,21.0,7.0,12.0
1790,4.0,5.0,3.0,2,0,0,0,0,0,0,...,8.0,1101.45,1304.0,4.0,7.0,20.0,10.0,48.0,21.0,7.0
2431,4.0,6.0,3.0,3,0,0,0,0,0,0,...,7.0,1101.45,1538.0,8.0,4.0,7.0,20.0,10.0,48.0,21.0


In [55]:
# Exlude first x days since additional features are actually showing the last days^^
df=df.drop(df[:last_x_days].index)

1149
1142


In [56]:
# Exclude: 
# - ID's, 
# - what we can't know at prediction time 
features = [col for col in list(df.keys()) if (col not in list(df.keys())[:3]) & \
                                              (col not in ['QTY', 'CUSTOMER'])]

In [57]:
# Split training & new data (7 days)
df_validation = df[df.DATE_ID >= df.DATE_ID.max()-7]
df_training = df.drop(df_validation.index)

In [None]:
X,y = shuffle(df_training, np.log(df_training.QTY), random_state = seed) #np.log(df.QTY)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = seed)

In [98]:
#Training base models
models = {'GradientBoostingRegressor': GradientBoostingRegressor(random_state=seed),
          'RandomForestRegressor': RandomForestRegressor(random_state=seed)}

models_trained = {}
for model in models:
    models_trained[model+'_base'] = models[model].fit(X_train[features],y_train)

In [85]:
def evaluate(model_name, trained_model, test_features, test_labels, prnt = False):
    predictions = trained_model.predict(test_features)
    errors = abs(predictions - test_labels)
    mape = 100 * np.mean(errors / test_labels)
    accuracy = 100 - mape
    if prnt == True:
        print('{} Performance'.format(model_name))
        print('Mean Absolute Error: {:0.4f}'.format(np.mean(errors)))
        print('Accuracy = {:0.2f}%\n'.format(accuracy))
        
    return model_name, round(np.mean(errors),4) , round(accuracy,2)

In [101]:
#Saving performance measures for comparison
performance = {}
for model in models_trained:
    evaluation = evaluate(model, models_trained[model] , X_test[features], y_test)
    performance[evaluation[0]] = [evaluation[1],evaluation[2]]

In [102]:
#Tuning loop with
params_list = []
params_grid_gbr = {
    'loss':['ls','lad', 'huber', 'quantile'],
    'learning_rate ':
}

SyntaxError: invalid syntax (<ipython-input-102-3201c8185414>, line 6)

In [111]:
list(np.arange(0.001, 0.01, 0.001))

[0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009000000000000001]

In [113]:
np.linspace(0.001,0.01,10)

array([0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009,
       0.01 ])