In [2]:
import os
from datetime import datetime

import pandas as pd
import numpy as np

from xgboost import XGBRegressor

from sklearn.utils import all_estimators
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import RandomizedSearchCV

from keras.models import Sequential
from keras.layers import LSTM, Dense

import sys
sys.path.insert(0, "../scripts")

from feature_extractor import FeatureExtractor

import warnings
warnings.filterwarnings('ignore')

In [3]:
DATA_PATH = '../data/raw/pinheiro'

# Average unit sales per (date, product) and keep rows dated before 2019-12-01.
# NOTE(review): `date` is compared as a string, which is only correct if the
# column holds ISO-formatted (YYYY-MM-DD) values — confirm against sales.csv.
df_sales = pd.read_csv(os.path.join(DATA_PATH, 'sales.csv'))
df_sales = df_sales.groupby(['date', 'product_code'])['unit_sales'].mean().reset_index()
df_sales = df_sales[df_sales['date'] < '2019-12-01']

# Select a single product. The rename is done without `inplace=True` and the
# result is an explicit copy, so adding the `baseline` column below never
# writes into a slice of `df_sales` (avoids SettingWithCopyWarning).
df = (
    df_sales[df_sales['product_code'] == 104649]
    .rename(columns={'unit_sales': 'y'})
    .copy()
)

# 7-row rolling mean of the target; the first 6 rows are NaN.
# NOTE(review): the window includes the current row's own `y`, so this
# baseline sees the value it is later compared against — confirm intended.
df['baseline'] = df['y'].rolling(7).mean()

In [4]:
# First day of the modelling period.
start_date = datetime(year=2018, month=1, day=1)

# Build the features for the selected product.
# NOTE(review): n_days / n_weeks are presumably the numbers of daily and
# weekly lags — verify against scripts/feature_extractor.py.
fe = FeatureExtractor(df, start_date)
fe.extract(n_days=6, n_weeks=7)

features = fe.features

In [5]:
# Rows of `df` whose date is one of the dates the feature extractor produced.
in_period = df['date'].isin(fe.date_list)
df_on_period = df[in_period]

# Model inputs, plus target and baseline as column vectors of shape (n, 1).
X = features['autoregressive'].values
y = df_on_period['y'].values.reshape(-1, 1)
y_baseline = df_on_period['baseline'].values.reshape(-1, 1)

In [6]:
# Fraction of the rows (taken from the start) used for training.
train_ratio = 0.8
train_size = int(train_ratio * df_on_period.shape[0])

In [7]:
# Ordered split: the first `train_size` rows train, the remainder test.
X_train, y_train = X[:train_size], y[:train_size]
X_test, y_test = X[train_size:], y[train_size:]

# The baseline is split at the same boundary as X and y. The training part
# must be the leading slice; it was previously sliced with `[train_size:]`
# and therefore duplicated the test baseline.
y_baseline_train = y_baseline[:train_size]
y_baseline_test = y_baseline[train_size:]

In [8]:
# Display the shapes of the training inputs and targets.
X_train.shape, y_train.shape

((559, 13), (559, 1))

In [9]:
# Display the shapes of the test inputs and targets.
X_test.shape, y_test.shape

((140, 13), (140, 1))

In [10]:
# LSTM input dimensions: columns of X as sequence length, columns of y as
# the per-step feature count.
# NOTE(review): X is used as-is elsewhere; a Keras LSTM normally expects
# input shaped (samples, n_steps, n_features) — confirm X is reshaped before
# the LSTM is fitted.
n_steps, n_features = X.shape[1], y.shape[1]

In [11]:
def get_LSTM(n_steps, n_features):
    """Build and compile a single-layer LSTM regression network.

    Parameters
    ----------
    n_steps : int
        First dimension of the LSTM ``input_shape`` (sequence length).
    n_features : int
        Second dimension of the LSTM ``input_shape`` (features per step).

    Returns
    -------
    dict
        ``{'LSTM': model}``, where ``model`` is a Keras ``Sequential`` made of
        a 50-unit ReLU ``LSTM`` layer followed by a one-unit ``Dense`` layer,
        compiled with the ``adam`` optimizer and ``mse`` loss. The dict form
        lets the caller merge it straight into a name -> model registry.
    """
    recurrent = LSTM(50, activation='relu', input_shape=(n_steps, n_features))
    network = Sequential([recurrent, Dense(1)])
    network.compile(optimizer='adam', loss='mse')
    return {'LSTM': network}

In [12]:
# Scikit-learn models: instantiate every registered regressor with its
# default constructor arguments.
estimators = all_estimators(type_filter='regressor')

all_regs = {}
for name, RegressorClass in estimators:
    try:
        all_regs[name] = RegressorClass()
    except Exception as e:
        # Some estimators cannot be built without arguments (e.g. they need a
        # wrapped estimator). Skip them, but say which one was skipped so the
        # message can be traced back to a class.
        print(f'Skipping {name}: {e}')

# Add the non-scikit-learn models under their own names.
all_regs[XGBRegressor.__name__] = XGBRegressor()
all_regs.update(get_LSTM(n_steps, n_features))

__init__() missing 1 required positional argument: 'estimator'
__init__() missing 1 required positional argument: 'base_estimator'
__init__() missing 1 required positional argument: 'estimators'
__init__() missing 1 required positional argument: 'estimators'


In [13]:
def eval_pred(y, y_pred):
    """Return the mean absolute error of ``y_pred`` relative to the mean of ``y``.

    Parameters
    ----------
    y : array-like
        Observed target values.
    y_pred : array-like
        Predicted values for the same samples.

    Returns
    -------
    float
        ``mean_absolute_error(y, y_pred)`` divided by ``np.mean(y)``.
    """
    absolute_error = mean_absolute_error(y, y_pred)
    target_mean = np.mean(y)
    return absolute_error / target_mean

In [33]:
# Fit every candidate model on the training split and record its
# mean-normalised MAE on the test split, keyed by model name.
performances = {}
for model_name, model in all_regs.items():
    try:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        performances[model_name] = eval_pred(y_test, y_pred)
    except Exception as exc:
        # Best effort: one failing model must not stop the comparison.
        # `Exception` (instead of a bare `except`) lets KeyboardInterrupt and
        # SystemExit through, and the cause is reported with the model name.
        print(f"Error on model {model_name}: {exc!r}")

Error on model GammaRegressor
Error on model IsotonicRegression
Error on model RadiusNeighborsRegressor


---

## GridSearch

In [35]:
# Number of trees: five evenly spaced values from 200 to 1000, plus two
# larger forests.
rf_n_estimators = np.linspace(200, 1000, 5).astype(int).tolist() + [1500, 2000]

# Maximum tree depth: six evenly spaced values from 5 to 55, plus None
# (the estimator default).
rf_max_depth = np.linspace(5, 55, 6).astype(int).tolist() + [None]

# Split-quality criterion.
# NOTE(review): 'mse' / 'mae' are the names used by older scikit-learn
# releases; newer ones call them 'squared_error' / 'absolute_error'.
rf_criterion = ['mse', 'mae']

# Whether each tree is trained on a bootstrap sample.
rf_bootstrap = [True, False]

# Search space keyed by RandomForestRegressor parameter name.
rf_grid = dict(
    n_estimators=rf_n_estimators,
    max_depth=rf_max_depth,
    criterion=rf_criterion,
    bootstrap=rf_bootstrap,
)

In [36]:
# Display the assembled random-forest search grid.
rf_grid

{'n_estimators': [200, 400, 600, 800, 1000, 1500, 2000],
 'max_depth': [5, 15, 25, 35, 45, 55, None],
 'criterion': ['mse', 'mae'],
 'bootstrap': [True, False]}

In [37]:
# model = all_regs['RandomForestRegressor']

# grid = RandomizedSearchCV(
#     estimator=model,
#     param_distributions=rf_grid,
#     n_iter=5,
#     cv=3,
#     verbose=2,
#     random_state=42,
#     n_jobs=-1)

# grid.fit(X_train, y_train)

In [14]:
# Select the random forest from the model registry.
model = all_regs['RandomForestRegressor']

In [52]:
# Take the first model stored in the registry; this rebinds `model`,
# replacing whatever it referred to before this cell ran.
model = list(all_regs.values())[0]

In [57]:
# Name (registry key) of the first model in the registry.
model_name = list(all_regs.keys())[0]

In [58]:
# Display the selected model's name.
model_name

'ARDRegression'

In [55]:
# Number of models in the registry.
len(all_regs)

52

In [None]:
# ARDRegression hyper-parameters (notes copied from the scikit-learn docs):
#
# alpha_1 : float, default=1e-6
# Hyper-parameter : shape parameter for the Gamma distribution prior over the alpha parameter.
#
# alpha_2 : float, default=1e-6
# Hyper-parameter : inverse scale parameter (rate parameter) for the Gamma distribution prior over the alpha parameter.
#
# lambda_1 : float, default=1e-6
# Hyper-parameter : shape parameter for the Gamma distribution prior over the lambda parameter.
#
# lambda_2 : float, default=1e-6
# Hyper-parameter : inverse scale parameter (rate parameter) for the Gamma distribution prior over the lambda parameter.

In [62]:
# Names of the prior hyper-parameters to tune: alpha_1, alpha_2, lambda_1,
# lambda_2 (in that order).
hyper_parameters_list = [
    f'{prior}_{index}'
    for prior in ('alpha', 'lambda')
    for index in (1, 2)
]

In [15]:
# Parameters of the currently selected model.
params = model.get_params()

In [16]:
# Display the model's parameters.
params

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [63]:
# Keep only the parameters named in `hyper_parameters_list`, with their
# current values; a name missing from `params` raises KeyError.
hyper_parameters = dict(
    (name, params[name]) for name in hyper_parameters_list
)

In [67]:
# Current value of the `alpha_1` hyper-parameter.
a = hyper_parameters['alpha_1']

In [68]:
# Check the Python type of the hyper-parameter value.
type(a)

float

In [70]:
# One quarter of the current `alpha_1` value.
0.25*a

2.5e-07

In [72]:
# Display the selected model's name.
model_name

'ARDRegression'

In [None]:
# Search grid for the selected model: for each hyper-parameter, a list of
# candidate values obtained by scaling its current (default) value.
# NOTE(review): the original cell was left unfinished (keys without values,
# which is a syntax error). The multipliers below are a placeholder spread
# around each default — adjust them to the range you actually want to search.
g1 = {
    name: [scale * default for scale in (0.25, 0.5, 1.0, 2.0, 4.0)]
    for name, default in hyper_parameters.items()
}