In [1]:
import pickle
import random
import warnings

import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from IPython.core.display import HTML
from matplotlib import pyplot as plt
from scipy import stats as ss

warnings.filterwarnings( 'ignore' )

In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Populates the interactive namespace from numpy and matplotlib (see the cell output).
# Cells below use `np` and `datetime` without importing them, so they presumably rely on this.
# NOTE(review): this star-import may also shadow names imported earlier (e.g. `random`) — confirm.
%pylab inline

# Global plot defaults: 'bmh' style sheet, 25x12 figure size, font size 10.
plt.style.use( 'bmh' )
plt.rcParams['figure.figsize'] = [25, 12]
plt.rcParams['font.size'] = 10

Populating the interactive namespace from numpy and matplotlib


In [3]:
# Prefix prepended to every pickle file name below. Leave empty to read from the
# current working directory; on Colab set it to "/content/drive/MyDrive/dsp/"
# (keep the trailing slash: the prefix is concatenated as-is).
DATA_DIR = ''


def _load_pickle(file_name):
    """Unpickle and return the object stored at DATA_DIR + file_name."""
    # NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
    with open(DATA_DIR + file_name, 'rb') as pickle_file:
        return pickle.load(pickle_file)


# x_training still carries the 'date' and 'sales' columns used by cross_validation.
x_training = _load_pickle('code7_x_training.pkl')
# Train/test features and targets used to fit and score the final model.
x_train = _load_pickle('code7_x_train.pkl')
x_test = _load_pickle('code7_x_test.pkl')
y_train = _load_pickle('code6_Y_train.pkl')
y_test = _load_pickle('code6_Y_test.pkl')

In [9]:
def _mean_std_label(values):
    """Format per-fold scores as '<mean> +/- <std>', both rounded to 2 decimals."""
    return np.round(np.mean(values),2).astype(str)+' +/- '+np.round(np.std(values),2).astype(str)


def cross_validation(model_name,x_training,kfold,model,verbose=False,fold_days=6*7):
    """Time-ordered k-fold cross-validation over the trailing part of `x_training`.

    The last `kfold * fold_days` days are split into `kfold` consecutive
    validation windows. For each window the model is fitted on every row dated
    strictly before the window start and scored on the rows inside the window.

    Parameters
    ----------
    model_name : str
        Label forwarded to `ml_error` and written to the 'Model Name' column.
    x_training : pandas.DataFrame
        Must contain a 'date' column and a 'sales' target column; every other
        column is used as a feature.
    kfold : int
        Number of validation windows.
    model : estimator
        Object exposing `fit(X, y)` (returning the fitted estimator) and
        `predict(X)`. It is refitted in place for every fold.
    verbose : bool, default False
        Print the fold number before each iteration.
    fold_days : int, default 42
        Length of each validation window in days (six weeks by default).

    Returns
    -------
    pandas.DataFrame
        One row with 'Model Name', 'MAE CV', 'MAPE CV' and 'RMSE CV', each
        metric formatted as '<mean> +/- <std>' across folds.

    Raises
    ------
    ValueError
        If a fold ends up with no training rows or no validation rows.
    """
    last_date=x_training['date'].max()
    scores={'MAE':[],'MAPE':[],'RMSE':[]}

    for k in range(kfold,0,-1):
        if verbose:
            print('\nKfold number: {}'.format(k))

        # Window k counts backwards from the most recent date: k=kfold is the
        # oldest window, k=1 ends exactly on the last available date.
        val_start_date=last_date-pd.Timedelta(days=k*fold_days)
        val_end_date=last_date-pd.Timedelta(days=(k-1)*fold_days)

        # Train on the past only; validate on [val_start_date, val_end_date].
        # Both ends are inclusive, so consecutive windows share their boundary date.
        in_training=x_training['date'] < val_start_date
        in_validation=(x_training['date'] >= val_start_date) & (x_training['date'] <= val_end_date)
        training=x_training[in_training]
        validation=x_training[in_validation]

        if training.empty or validation.empty:
            raise ValueError(
                'Fold {} has no training or validation rows; '
                'reduce kfold or fold_days.'.format(k))

        # Split features from target
        xtraining=training.drop(['date','sales'],axis=1)
        ytraining=training['sales']
        xvalidation=validation.drop(['date','sales'],axis=1)
        yvalidation=validation['sales']

        # Fit and predict
        m=model.fit(xtraining,ytraining)
        yhat=m.predict(xvalidation)

        # Errors are computed after np.expm1 on both targets and predictions
        # (presumably 'sales' is stored on a log1p scale — confirm upstream).
        m_result=ml_error(model_name,np.expm1(yvalidation),np.expm1(yhat))

        # ml_error returns a one-row frame; keep the scalar of each metric.
        for metric in scores:
            scores[metric].append(m_result[metric].iloc[0])

    return pd.DataFrame({ 'Model Name': model_name,
                          'MAE CV': _mean_std_label(scores['MAE']),
                          'MAPE CV': _mean_std_label(scores['MAPE']),
                          'RMSE CV': _mean_std_label(scores['RMSE']),
                          },index=[0])
def mean_absolute_percentage_error(y,yhat):
    """Return the mean of |(y - yhat) / y| as a fraction (not multiplied by 100)."""
    residual=y-yhat
    relative_error=residual/y
    return np.mean(np.abs(relative_error))

def ml_error(model_name,y,yhat):
    """Build a one-row DataFrame with the MAE, MAPE and RMSE of `yhat` against `y`.

    `model_name` is stored in the 'Model name' column; MAPE comes from
    `mean_absolute_percentage_error` and RMSE is the square root of the MSE.
    """
    from sklearn.metrics import mean_absolute_error,mean_squared_error

    metrics={
        'Model name': model_name,
        'MAE': mean_absolute_error(y,yhat),
        'MAPE': mean_absolute_percentage_error(y,yhat),
        'RMSE': np.sqrt(mean_squared_error(y,yhat)),
    }
    return pd.DataFrame(metrics, index=[0])

# 8.0 Hyperparameter fine tuning

Faremos o fine tuning do modelo selecionado para seguir para produção. Na prática, o Random Forest foi o modelo com melhor desempenho; porém, devido ao seu alto custo computacional, optamos pelo XGBoost. Por essa razão, o fine tuning será aplicado a esse modelo.

## 8.1 Random Search [NÃO RODAR LOCAL]

# Search space: candidate values for each XGBoost hyperparameter.
param={
    'n_estimators':[1500,1700,2500,3000,3500],
    'eta':[0.01,0.03],
    'max_depth':[3,5,9],
    'subsample':[0.1,0.5,0.7],
    'colsample_bytree':[0.3,0.7,0.9],
    'min_child_weight':[3,8,15]
        }

# Number of random hyperparameter combinations to evaluate.
MAX_EVAL=10

# One cross-validation summary row per sampled combination; concatenated once
# after the loop instead of growing a DataFrame on every iteration.
search_results=[]

for _ in range(MAX_EVAL):
    # choose one value per hyperparameter at random
    hp = {k: random.choice(v) for k,v in param.items()}
    print(hp)

    #model
    model_xgb=xgb.XGBRegressor(objective='reg:squarederror',
                               n_estimators=hp['n_estimators'],
                               eta=hp['eta'],
                               max_depth=hp['max_depth'],
                               subsample=hp['subsample'],
                               colsample_bytree=hp['colsample_bytree'],
                               min_child_weight=hp['min_child_weight'])

    #performance: 5-fold time-ordered cross-validation on the full training frame
    model_xgb_result=cross_validation('XGBoost Regressor',x_training,5,model_xgb,verbose=False)
    search_results.append(model_xgb_result)

# pd.concat raises on an empty list, so fall back to an empty frame when MAX_EVAL is 0.
final_result=pd.concat(search_results) if search_results else pd.DataFrame()
final_result

![Random search cross-validation results](final_result.png "final_result")

## 8.2 Final Model

In [5]:
# Best hyperparameters according to the random search results (the screenshot
# is in the repository); the random search iterations were not run locally.
param_tuned = dict(
    n_estimators=3000,
    eta=0.03,
    max_depth=5,
    subsample=0.7,
    colsample_bytree=0.7,
    min_child_weight=3,
)

In [7]:
#!pip install xgboost

Collecting xgboost
  Using cached xgboost-1.4.1-py3-none-manylinux2010_x86_64.whl (166.7 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.4.1


In [11]:
# Final model.
# Requires the xgboost package (e.g. run `conda install xgboost` in a terminal).
import xgboost as xgb

# Every key of param_tuned is forwarded as a constructor keyword; the regressor
# is then fitted on the training split.
model_xgb_tuned = xgb.XGBRegressor(
    objective='reg:squarederror',
    **param_tuned,
).fit(x_train, y_train)

In [12]:
# Predict on the test features with the tuned model.
# NOTE(review): predictions appear to be on a log1p scale, since the scoring cell applies np.expm1 — confirm.
yhat_model_xgb_tuned=model_xgb_tuned.predict(x_test)

In [14]:
# Score the tuned model on the test split; np.expm1 is applied to both the
# targets and the predictions before the errors are computed.
model_xgb_result_tuned=ml_error('XGBoost Regressor',np.expm1(y_test),np.expm1(yhat_model_xgb_tuned))

In [15]:
# Display the tuned model's test-set metrics table.
model_xgb_result_tuned

Unnamed: 0,Model name,MAE,MAPE,RMSE
0,XGBoost Regressor,765.146464,0.116287,1093.511057


In [16]:
# Persist the tuned model's metrics table (the DataFrame returned by ml_error,
# not the fitted model itself) to a pickle file in the working directory.
with open('code8_model_xgb_result_tuned.pkl', 'wb') as f:
    pickle.dump(model_xgb_result_tuned, f)