In [1]:
import datetime
import warnings

import numpy as np
import pandas as pd
import seaborn as sns
from IPython.core.display import HTML
from matplotlib import pyplot as plt
from scipy import stats as ss

# Suppress every Python warning for the rest of the session. No category is
# given, so this also hides deprecation notices raised by the imported libraries.
warnings.filterwarnings( 'ignore' )

In [2]:
%matplotlib inline
%pylab inline

plt.style.use( 'bmh' )
plt.rcParams['figure.figsize'] = [25, 12]
plt.rcParams['font.size'] = 10

Populating the interactive namespace from numpy and matplotlib


In [3]:
# Load the three pickled objects this notebook starts from (the file names
# suggest the main frame plus numeric and categorical attribute frames).
# NOTE: unpickling can execute arbitrary code - only load files you trust.
pickle_paths = ('df5.pkl', 'num_attributes2.pkl', 'cat_attributes2.pkl')
df6, num, cat = (pd.read_pickle(path) for path in pickle_paths)

# 6.0 Feature Selection

"A explicação mais simples  sobre um fenômeno observado, deveria prevalecer sobre as explicações mais complexas."
-Occam's Razor

In [4]:
# Columns removed before feature selection. The calendar columns listed here
# have sin/cos-encoded counterparts (e.g. `month_sin`, `month_cos`) that stay
# in the frame.
cols_drop = [
    'week_of_year',
    'day',
    'month',
    'day_of_week',
    'promo_since',
    'competition_since',
    'year_week',
]
df6 = df6.drop(columns=cols_drop)

## 6.1 Test train split

Como o dataset é uma base temporal, não dá para selecionar aleatoriamente as observações: os conjuntos de treino e teste ficariam enviesados (o modelo seria treinado com dados posteriores aos de teste), o que causaria overfitting. É preciso selecionar os dados de teste como sendo o período final da base.

In [5]:
# Earliest date recorded for each store.
df6.groupby('store')['date'].min().reset_index()

Unnamed: 0,store,date
0,1,2013-01-02
1,2,2013-01-02
2,3,2013-01-02
3,4,2013-01-02
4,5,2013-01-02
...,...,...
1110,1111,2013-01-02
1111,1112,2013-01-02
1112,1113,2013-01-02
1113,1114,2013-01-02


In [6]:
# Latest date recorded for each store.
df6.groupby('store')['date'].max().reset_index()

Unnamed: 0,store,date
0,1,2015-07-31
1,2,2015-07-31
2,3,2015-07-31
3,4,2015-07-31
4,5,2015-07-31
...,...,...
1110,1111,2015-07-31
1111,1112,2015-07-31
1112,1113,2015-07-31
1113,1114,2015-07-31


In [7]:
# Date six weeks before the last date in the dataset (used as the train/test cutoff).
# Uses the dataset-wide maximum rather than the first store's maximum, and
# pandas' own Timedelta so no separately imported `datetime` name is needed.
df6['date'].max() - pd.Timedelta(weeks=6)

Timestamp('2015-06-19 00:00:00')

In [8]:
# Cutoff for the temporal split: the last six weeks of data form the test set.
# Computed from the data instead of hard-coding the date in two places.
split_date = df6['date'].max() - pd.Timedelta(weeks=6)

# train dataset: every row strictly before the cutoff
# (the frame still carries the `date` and `sales` columns)
X_train = df6[df6['date'] < split_date]
Y_train = X_train['sales']

# test dataset: every row on or after the cutoff
X_test = df6[df6['date'] >= split_date]
Y_test = X_test['sales']

# Sanity check: the two splits must cover every row exactly once
# (rows with a missing date would fall into neither split).
assert len(X_train) + len(X_test) == len(df6), 'train/test split does not cover all rows'

print('Train Min Date: {}'.format(X_train['date'].min()))
print('Train Max Date: {}'.format(X_train['date'].max()))

print('\nTest Min Date: {}'.format(X_test['date'].min()))
print('Test Max Date: {}'.format(X_test['date'].max()))

Train Min Date: 2013-01-01 00:00:00
Train Max Date: 2015-06-18 00:00:00

Test Min Date: 2015-06-19 00:00:00
Test Max Date: 2015-07-31 00:00:00


## 6.2 Feature Selection - Wrapper Method (Boruta)

In [9]:
# define RandomForestRegressor

# from sklearn.ensemble import RandomForestRegressor
# rf=RandomForestRegressor(n_jobs=1)

In [10]:
# Display the training frame; it still contains the `date` and `sales` columns.
X_train

Unnamed: 0,store,date,sales,promo,school_holiday,store_type,assortment,competition_distance,competition_open_since_month,competition_open_since_year,...,state_holiday_public_holiday,state_holiday_regular_day,day_of_week_sin,day_of_week_cos,month_sin,month_cos,day_sin,day_cos,week_of_year_sin,week_of_year_cos
47945,1,2015-06-18,8.443762,1,0,2,1,-0.170968,9,2008,...,0,1,-0.433884,-0.900969,1.224647e-16,-1.000000,-0.485302,-0.874347,0.120537,-0.992709
47946,2,2015-06-18,8.547722,1,0,0,1,-0.283871,11,2007,...,0,1,-0.433884,-0.900969,1.224647e-16,-1.000000,-0.485302,-0.874347,0.120537,-0.992709
47947,3,2015-06-18,8.927712,1,0,0,1,1.903226,12,2006,...,0,1,-0.433884,-0.900969,1.224647e-16,-1.000000,-0.485302,-0.874347,0.120537,-0.992709
47948,4,2015-06-18,9.091669,1,0,2,3,-0.275806,9,2009,...,0,1,-0.433884,-0.900969,1.224647e-16,-1.000000,-0.485302,-0.874347,0.120537,-0.992709
47949,5,2015-06-18,8.502080,1,0,0,1,4.448387,4,2015,...,0,1,-0.433884,-0.900969,1.224647e-16,-1.000000,-0.485302,-0.874347,0.120537,-0.992709
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1016776,682,2013-01-01,8.124447,0,1,1,1,-0.351613,9,2006,...,1,0,0.974928,-0.222521,5.000000e-01,0.866025,0.201299,0.979530,0.120537,0.992709
1016827,733,2013-01-01,9.284148,0,1,1,2,-0.237097,10,1999,...,1,0,0.974928,-0.222521,5.000000e-01,0.866025,0.201299,0.979530,0.120537,0.992709
1016863,769,2013-01-01,8.524367,0,1,1,2,-0.240323,1,2013,...,1,0,0.974928,-0.222521,5.000000e-01,0.866025,0.201299,0.979530,0.120537,0.992709
1017042,948,2013-01-01,8.410053,0,1,1,2,-0.145161,1,2013,...,1,0,0.974928,-0.222521,5.000000e-01,0.866025,0.201299,0.979530,0.120537,0.992709


In [11]:
# Boruta expects plain NumPy arrays, so drop the 'date' and 'sales' columns from the training frame and pass only the underlying values

# X_train_n=X_train.drop(['date','sales'],axis=1).values

# ravel flattens the target values into a 1-D array

# Y_train_n=Y_train.values.ravel()

In [12]:
#boruta
# from boruta import BorutaPy

#commented for performance, Boruta takes long to run
#boruta=BorutaPy(rf,n_estimators='auto',verbose=2,random_state=42).fit(X_train_n,Y_train_n)

In [13]:
#ranking based on relevance

# cols_selected=boruta.support_.tolist()

#Best features from Boruta

# X_train_fs=X_train.drop(['date','sales'],axis=1)
# cols_selected_boruta = X_train_fs.iloc[:,cols_selected].columns.to_list()

#not selected boruta

# cols_not_selected_boruta = np.setdiff1d(X_train_fs.columns, cols_selected_boruta)

In [14]:
# cols_selected_boruta

In [15]:
# cols_not_selected_boruta

A partir da seleção feita pelo Boruta, precisamos retornar à EDA e verificar se as variáveis não selecionadas foram consideradas relevantes naquele momento ou não.

De acordo com a EDA, temos que as hipóteses 9,10,11 e 12 foram consideradas de alta relevância. 

Hipóteses   |  Conclusão  |  Relevância  | Variável  | Boruta | Abordagem I1
----------- | ----------- | ------------ | ------------| ------------ | -
H1          | Falsa       | Baixa        |  |  | 
H2          | Falsa       | Media        |  |  | 
H3          | Falsa       | Media        |  |  | 
H4          | Falsa       | Baixa |  |  | 
H5          | -           | - |  |  | 
H7          | Falsa       | Baixa | |  |  
H8          | Falsa       | Media |  |  | 
H9          | Falsa       | Alta | `year` | não relevante | não incluir
H10         | Falsa       | Alta | `month` | só incluiu `month_cos` | incluir `month_sin`
H11         | Verdadeira  | Alta | `day` | relevante | nada a fazer
H12         | Verdadeira  | Alta | `day_of_week` | relevante | nada a fazer
H13         | Verdadeira  | Baixa |  |  | 

In [16]:
# Features kept for modelling: the columns Boruta selected, written out by hand
# so the slow Boruta run does not have to be repeated, plus `month_sin`, which
# is added manually to accompany `month_cos`.
cols_selected_boruta = [
    'store', 'promo', 'store_type', 'assortment', 'competition_distance',
    'competition_open_since_month', 'competition_open_since_year', 'promo2',
    'promo2_since_week', 'promo2_since_year', 'competition_time_month',
    'promo_time_week', 'day_of_week_sin', 'day_of_week_cos', 'month_cos', 'month_sin',
    'day_sin', 'day_cos', 'week_of_year_sin', 'week_of_year_cos',
]

# Columns that are not model features but are used alongside them:
# `date` drives the temporal split and `sales` is the target.
feat_to_add = ['date', 'sales']

# Full column list = selected features + the extra columns. It is built on a
# copy so `cols_selected_boruta` remains a pure feature list. The original cell
# only referenced `.extend` without calling it, so `feat_to_add` was never used.
cols_selected_boruta_full = cols_selected_boruta.copy()
cols_selected_boruta_full.extend(feat_to_add)
cols_selected_boruta_full

In [17]:
# df6.to_pickle('df6.pkl')
# num.to_pickle('num_attributes2.pkl')
# cat.to_pickle('cat_attributes2.pkl')