In [5]:
!pip install pickle5



In [6]:
import pandas as pd
import seaborn as sns
import warnings
import pickle5 as pickle
from matplotlib import pyplot as plt
from IPython.core.display import HTML
from scipy import stats as ss

warnings.filterwarnings( 'ignore' )

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
%matplotlib inline
%pylab inline

plt.style.use( 'bmh' )
plt.rcParams['figure.figsize'] = [25, 12]
plt.rcParams['font.size'] = 10

Populating the interactive namespace from numpy and matplotlib


In [9]:
with open('/content/drive/MyDrive/dsp/code6_cols_selected_boruta.pkl', 'rb') as f:
    cols_selected_boruta = pickle.load(f)
with open('/content/drive/MyDrive/dsp/code6_feat_to_add.pkl', 'rb') as f:
    feat_to_add = pickle.load(f)
with open('/content/drive/MyDrive/dsp/code6_df6.pkl', 'rb') as f:
    df7 = pickle.load(f)
with open('/content/drive/MyDrive/dsp/code6_X_train.pkl', 'rb') as f:
    X_train = pickle.load(f)
with open('/content/drive/MyDrive/dsp/code6_X_test.pkl', 'rb') as f:
    X_test = pickle.load(f)
with open('/content/drive/MyDrive/dsp/code6_Y_train.pkl', 'rb') as f:
    Y_train = pickle.load(f)
with open('/content/drive/MyDrive/dsp/code6_Y_test.pkl', 'rb') as f:
    Y_test = pickle.load(f)

# 7.0 ML Analysis and Techniques

In [10]:
cols_selected_boruta=['store',
 'promo',
 'store_type',
 'assortment',
 'competition_distance',
 'competition_open_since_month',
 'competition_open_since_year',
 'promo2',
 'promo2_since_week',
 'promo2_since_year',
 'competition_time_month',
 'promo_time_week',
 'day_of_week_sin',
 'day_of_week_cos',
 'month_cos',
 'month_sin',
 'day_sin',
 'day_cos',
 'week_of_year_sin',
 'week_of_year_cos']

In [11]:
x_train=X_train[cols_selected_boruta]
x_test=X_test[cols_selected_boruta]

In [12]:
def mean_absolute_percentage_error(y,yhat):
    return np.mean(np.abs((y-yhat)/y))

def ml_error(model_name,y,yhat):
    from sklearn.metrics import mean_absolute_error,mean_squared_error
    
    mae=mean_absolute_error(y,yhat)
    mape=mean_absolute_percentage_error(y,yhat)
    rmse=np.sqrt(mean_squared_error(y,yhat))
    
    return pd.DataFrame({'Model name': model_name,
                        'MAE': mae,
                        'MAPE': mape,
                        'RMSE': rmse}, index=[0])

## 7.1 Average Model

In [13]:
aux1=x_test.copy()
aux1['sales']=Y_test.copy()

#prediciton
aux2=aux1[['store','sales']].groupby('store').mean().reset_index().rename(columns={'sales':'prediction'})
aux1=pd.merge(aux1,aux2,how='left',on='store')
#the preditions 'y_hat'
yhat_baseline=aux1['prediction']

#performance
# nas transformações a classe foi normalizada pela logaritmica, agora para avaliar aperformance, 
# precisamos resgatar o valor original das vendas, aplicando a transformação inversa, no caso a exponencial
baseline_result = ml_error('Average Model',np.expm1(Y_test),np.expm1(yhat_baseline))
baseline_result

Unnamed: 0,Model name,MAE,MAPE,RMSE
0,Average Model,1354.800353,0.455051,1835.135542


## 7.2 Linear Regression

In [14]:
from sklearn.linear_model import LinearRegression
#model
lr=LinearRegression().fit(x_train,Y_train)

In [15]:
#prediction
yhat_lr=lr.predict(x_test)

#performance
lr_result=ml_error('Linear Regression',np.expm1(Y_test),np.expm1(yhat_lr))
lr_result

Unnamed: 0,Model name,MAE,MAPE,RMSE
0,Linear Regression,1867.654504,0.292836,2671.333847


## 7.3. Linear Regression Regularized

In [16]:
from sklearn.linear_model import Lasso

#model
# the higher the alpha, the most feature coefficients are zero, that is more generalised
lrr=Lasso(alpha=0.001).fit(x_train,Y_train)

In [17]:
#prediction
yhat_lrr=lrr.predict(x_test)

#performance
lrr_result=ml_error('Linear Regression Lasso',np.expm1(Y_test),np.expm1(yhat_lrr))
lrr_result

Unnamed: 0,Model name,MAE,MAPE,RMSE
0,Linear Regression Lasso,1870.005016,0.28825,2694.092897


Performance da regressão linear foi de maior erro que a média, o que indica que esse comportamento provavelmente não é linear.

## 7.4. Random Forest Regressor

In [18]:
from sklearn.ensemble import RandomForestRegressor

#model
rf=RandomForestRegressor(n_estimators=100,n_jobs=1,random_state=42).fit(x_train,Y_train)

In [19]:
#prediction
yhat_rf=rf.predict(x_test)

#performance
rf_result=ml_error('Random Forest Regressor',np.expm1(Y_test),np.expm1(yhat_rf))
rf_result

Unnamed: 0,Model name,MAE,MAPE,RMSE
0,Random Forest Regressor,677.627906,0.099857,1007.328966


## 7.5. XGBoost Regressor

In [20]:
import xgboost as xgb

#model
model_xgb=xgb.XGBRegressor(objective='reg:squarederror',
                           n_estimators=100,
                           eta=0.01,
                           max_depth=10,
                           subsample=0.7,
                           colsample_bytree=0.9).fit(x_train,Y_train)

In [21]:
#prediction
yhat_model_xgb=model_xgb.predict(x_test)

#performance
model_xgb_result=ml_error('XGBoost Regressor',np.expm1(Y_test),np.expm1(yhat_model_xgb))
model_xgb_result

Unnamed: 0,Model name,MAE,MAPE,RMSE
0,XGBoost Regressor,871.007756,0.126646,1282.177812


## 7.6. Performance Overview

In [22]:
mlanalysis_result=pd.concat([baseline_result,lr_result,lrr_result,rf_result,model_xgb_result])
mlanalysis_result.sort_values('RMSE')

Unnamed: 0,Model name,MAE,MAPE,RMSE
0,Random Forest Regressor,677.627906,0.099857,1007.328966
0,XGBoost Regressor,871.007756,0.126646,1282.177812
0,Average Model,1354.800353,0.455051,1835.135542
0,Linear Regression,1867.654504,0.292836,2671.333847
0,Linear Regression Lasso,1870.005016,0.28825,2694.092897


## 7.7. Cross Validation

In [23]:
feat_to_add

['date', 'sales']

In [24]:
cols_selected_boruta_full=cols_selected_boruta.copy()

In [25]:
cols_selected_boruta_full.extend(feat_to_add)

In [26]:
cols_selected_boruta_full

['store',
 'promo',
 'store_type',
 'assortment',
 'competition_distance',
 'competition_open_since_month',
 'competition_open_since_year',
 'promo2',
 'promo2_since_week',
 'promo2_since_year',
 'competition_time_month',
 'promo_time_week',
 'day_of_week_sin',
 'day_of_week_cos',
 'month_cos',
 'month_sin',
 'day_sin',
 'day_cos',
 'week_of_year_sin',
 'week_of_year_cos',
 'date',
 'sales']

In [27]:
x_training=X_train[cols_selected_boruta_full]

In [28]:
x_training.head()

Unnamed: 0,store,promo,store_type,assortment,competition_distance,competition_open_since_month,competition_open_since_year,promo2,promo2_since_week,promo2_since_year,competition_time_month,promo_time_week,day_of_week_sin,day_of_week_cos,month_cos,month_sin,day_sin,day_cos,week_of_year_sin,week_of_year_cos,date,sales
47945,1,1,2,1,-0.170968,9,2008,0,25,2015,0.891892,0.287016,-0.433884,-0.900969,-1.0,1.224647e-16,-0.485302,-0.874347,0.120537,-0.992709,2015-06-18,8.443762
47946,2,1,0,1,-0.283871,11,2007,1,13,2010,1.027027,0.908884,-0.433884,-0.900969,-1.0,1.224647e-16,-0.485302,-0.874347,0.120537,-0.992709,2015-06-18,8.547722
47947,3,1,0,1,1.903226,12,2006,1,14,2011,1.189189,0.788155,-0.433884,-0.900969,-1.0,1.224647e-16,-0.485302,-0.874347,0.120537,-0.992709,2015-06-18,8.927712
47948,4,1,2,3,-0.275806,9,2009,0,25,2015,0.72973,0.287016,-0.433884,-0.900969,-1.0,1.224647e-16,-0.485302,-0.874347,0.120537,-0.992709,2015-06-18,9.091669
47949,5,1,0,1,4.448387,4,2015,0,25,2015,-0.189189,0.287016,-0.433884,-0.900969,-1.0,1.224647e-16,-0.485302,-0.874347,0.120537,-0.992709,2015-06-18,8.50208


In [29]:
# último periodo de 6 semanas para para validation
# k sendo o fold da iteração
# nesse caso começamos com o última período de 6 semanas sendo a validação, k=1 
# no k=2, usamos o penúltimo período, etc.
# precisamos começar então do último k, só para iniciarmos com o primeiro intervalo de training 
# e validation cronologicamente 
def cross_validation(model_name,x_training,kfold,model,verbose=False):
  mae_list=[]
  mape_list=[]
  rmse_list=[]

  for k in range(kfold,0,-1):
    if verbose:
      print('\nKfold number: {}'.format(k))
    val_start_date=x_training['date'].max()- datetime.timedelta(days=k*6*7)
    val_end_date=x_training['date'].max()- datetime.timedelta(days=(k-1)*6*7)

    #filtering dataset
    training = x_training[x_training['date'] < val_start_date]
    validation = x_training[(x_training['date'] >= val_start_date) & (x_training['date'] >= val_end_date)]

    # training and validation dataset
    xtraining=training.drop(['date','sales'],axis=1)
    ytraining=training['sales']
    xvalidation=validation.drop(['date','sales'],axis=1)
    yvalidation=validation['sales']

    #model
    m=model.fit(xtraining,ytraining)

    #prediction
    yhat=m.predict(xvalidation)

    #performance
    m_result=ml_error(model_name,np.expm1(yvalidation),np.expm1(yhat))

    #store performance for each kfold iteration
    mae_list.append(m_result['MAE'])
    mape_list.append(m_result['MAPE'])
    rmse_list.append(m_result['RMSE'])

  return pd.DataFrame({ 'Model Name': model_name,
                        'MAE CV':np.round(np.mean(mae_list),2).astype(str)+' +/- '+np.round(np.std(mae_list),2).astype(str),
                        'MAPE CV':np.round(np.mean(mape_list),2).astype(str)+' +/- '+np.round(np.std(mape_list),2).astype(str),
                        'RMSE CV':np.round(np.mean(rmse_list),2).astype(str)+' +/- '+np.round(np.std(rmse_list),2).astype(str),
                        },index=[0])

In [30]:
model=LinearRegression()
a=cross_validation('Linear Regressor',x_training,5,model,verbose=True)


Kfold number: 5

Kfold number: 4

Kfold number: 3

Kfold number: 2

Kfold number: 1


In [31]:
a

Unnamed: 0,Model Name,MAE CV,MAPE CV,RMSE CV
0,Linear Regressor,1937.11 +/- 79.38,0.29 +/- 0.02,2745.97 +/- 154.27


Agora aplicando para os outros modelos já treinados.

In [32]:
lr_result_cv=cross_validation('Linear Regression',x_training,5,lr,verbose=False)

In [33]:
lr_result_cv

Unnamed: 0,Model Name,MAE CV,MAPE CV,RMSE CV
0,Linear Regression,1937.11 +/- 79.38,0.29 +/- 0.02,2745.97 +/- 154.27


In [34]:
lrr_result_cv=cross_validation('Lasso',x_training,5,lrr,verbose=False)
lrr_result_cv

Unnamed: 0,Model Name,MAE CV,MAPE CV,RMSE CV
0,Lasso,1948.62 +/- 89.48,0.28 +/- 0.01,2792.64 +/- 170.62


In [35]:
rf_result_cv=cross_validation('Random Forest',x_training,5,rf,verbose=True)
rf_result_cv


Kfold number: 5

Kfold number: 4

Kfold number: 3

Kfold number: 2

Kfold number: 1


Unnamed: 0,Model Name,MAE CV,MAPE CV,RMSE CV
0,Random Forest,796.17 +/- 149.31,0.11 +/- 0.02,1197.66 +/- 271.98


In [36]:
model_xgb_result_cv=cross_validation('XGBoost',x_training,5,rf,verbose=True)
model_xgb_result_cv


Kfold number: 5

Kfold number: 4

Kfold number: 3

Kfold number: 2

Kfold number: 1


Unnamed: 0,Model Name,MAE CV,MAPE CV,RMSE CV
0,XGBoost,796.17 +/- 149.31,0.11 +/- 0.02,1197.66 +/- 271.98


In [37]:
mlanalysis_result_cv=pd.concat([lr_result_cv,lrr_result_cv,rf_result_cv,model_xgb_result_cv])
mlanalysis_result_cv.sort_values('RMSE CV')

Unnamed: 0,Model Name,MAE CV,MAPE CV,RMSE CV
0,Random Forest,796.17 +/- 149.31,0.11 +/- 0.02,1197.66 +/- 271.98
0,XGBoost,796.17 +/- 149.31,0.11 +/- 0.02,1197.66 +/- 271.98
0,Linear Regression,1937.11 +/- 79.38,0.29 +/- 0.02,2745.97 +/- 154.27
0,Lasso,1948.62 +/- 89.48,0.28 +/- 0.01,2792.64 +/- 170.62


In [38]:
with open('/content/drive/MyDrive/dsp/code7_x_training.pkl', 'wb') as f:
    pickle.dump(x_training, f)
with open('/content/drive/MyDrive/dsp/code7_x_train.pkl', 'wb') as f:
    pickle.dump(x_train, f)
with open('/content/drive/MyDrive/dsp/code7_x_test.pkl', 'wb') as f:
    pickle.dump(x_test, f)