In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _dirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Carregando as bibliotecas

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

import plotly.express as px

from plotly.subplots import make_subplots
import plotly.graph_objects as go

from statsmodels.tsa.seasonal import seasonal_decompose
from scipy import stats

from sklearn.impute import KNNImputer

import statsmodels.api as sm

# Explicação e Leitura dos Arquivos:

- train: Dados de treino
O objetivo é fazer a predição das vendas por loja e por produto, com as seguintes variáveis:
    - id: ID das vendas
    - store_nbr: As lojas em que os produtos foram vendidos.
    - family: Identifica os tipos de produtos vendidos.
    - sales: O total de vendas de um tipo de produto em uma loja e data específicas. É possível existirem valores fracionados, pois alguns produtos (geralmente alimentos) são vendidos em unidades fracionadas (1.5 kg of cheese, for instance, as opposed to 1 bag of chips).
    - onpromotion: Especifica se um determinado tipo de produto está em promoção.

- test: Dados de teste
The dates in the test data are for the 15 days after the last date in the training data.

- stores.csv:
Store metadata, including city, state, type, and cluster.
cluster is a grouping of similar stores.

- oil.csv:
Daily oil price. Includes values during both the train and test data timeframes. (Ecuador is an oil-dependent country and its economic health is highly vulnerable to shocks in oil prices.)
- holidays_events.csv

    NOTE: Pay special attention to the transferred column. A holiday that is transferred officially falls on that calendar day, but was moved to another date by the government. A transferred day is more like a normal day than a holiday. To find the day that it was actually celebrated, look for the corresponding row where type is Transfer. For example, the holiday Independencia de Guayaquil was transferred from 2012-10-09 to 2012-10-12, which means it was celebrated on 2012-10-12. Days that are type Bridge are extra days that are added to a holiday (e.g., to extend the break across a long weekend). These are frequently made up by the type Work Day which is a day not normally scheduled for work (e.g., Saturday) that is meant to payback the Bridge.

    Additional holidays are days added to a regular calendar holiday, for example, as typically happens around Christmas (making Christmas Eve a holiday).

- Additional Notes
Wages in the public sector are paid every two weeks, on the 15th and on the last day of the month. Supermarket sales could be affected by this.
A magnitude 7.8 earthquake struck Ecuador on April 16, 2016. People rallied in relief efforts donating water and other first need products which greatly affected supermarket sales for several weeks after the earthquake.

In [None]:
# All competition CSVs are read from the same Kaggle input folder.
DIR_DADOS = '/kaggle/input/store-sales-time-series-forecasting'

treino   = pd.read_csv(DIR_DADOS + '/train.csv')
teste    = pd.read_csv(DIR_DADOS + '/test.csv')
stores   = pd.read_csv(DIR_DADOS + '/stores.csv')
petroleo = pd.read_csv(DIR_DADOS + '/oil.csv')
feriados = pd.read_csv(DIR_DADOS + '/holidays_events.csv')

## Explorando os Dados de Treino

**1. Visualização e entendimento dos dados de TREINO**

In [None]:
treino.head()

In [None]:
# The first positional argument of DataFrame.info is `verbose`; the string 'all' was only
# acting as a truthy flag. Ask explicitly for the full column list and for non-null counts,
# which pandas may otherwise omit on large frames.
treino.info(verbose=True, show_counts=True)

**1.1.** Valores únicos por variável

In [None]:
# One row per training column with the number of distinct values it holds.
df = (
    treino.nunique()
    .rename_axis('Colunas')
    .reset_index(name='Qtd_Itens_Unicos')
)
df

In [None]:
# Parse the date column and derive the calendar features used by the plots and the model.
# pd.to_datetime is used instead of astype('datetime64'): the unit-less cast is rejected by
# pandas 2.x, while to_datetime behaves the same on every pandas version.
treino['date']     = pd.to_datetime(treino['date'])
treino["mes_date"] = treino['date'].dt.month
treino['ano_date'] = treino['date'].dt.year
treino["dia_date"] = treino['date'].dt.day
# 'YYYYMM' string key (same value the period -> str -> strip('-') chain produced).
treino['anomes']   = treino['date'].dt.strftime('%Y%m')

**2. Plotando a série Temporal**

In [None]:
# Six-panel overview of the sales series: raw rows, monthly totals, month-by-year
# seasonality view, and total sales per year / month / day of month.
fig = make_subplots(
    rows=9, cols=3, subplot_titles=('Série Temporal dos Dados sem agrupamento',
                                    "Serie temporal acumulada por Ano e Mês",
                                    "Série Temporal Mensal por Ano - Verificar Sazonalidade",
                                    'Vendas Acumuladas por Ano',
                                    'Vendas Acumuladas por Mes',
                                    'Vendas Acumuladas por Dia'),
    # Each wide panel spans two grid rows; grid row 5 is intentionally left empty as spacing.
    specs = [  [{"rowspan": 2, "colspan": 3}, None, None],
               [None, None, None],
               [{"rowspan": 2, "colspan": 3}, None, None],
               [None, None, None],
               [None, None, None],
               [{"rowspan": 2, "colspan": 3}, None, None],
               [None, None, None],
               [{"rowspan": 2}, {"rowspan": 2}, {"rowspan": 2}],
               [None, None, None]
            ]
)

# Panel 1: every training row, no aggregation.
fig.add_trace(go.Scatter(x = treino['date'], y = treino['sales']), row = 1, col = 1)

# Panel 2: total sales per year-month key.
vendas_anomes = treino.groupby('anomes')['sales'].sum()
fig.add_trace(go.Scatter(x = vendas_anomes.index, y = vendas_anomes), row = 3, col = 1)

# Panel 3: one line per year over the month axis, to compare seasonality between years.
vendas_ano_mes = treino.groupby(['ano_date', 'mes_date'], as_index = False)['sales'].sum()
for ano, vendas_do_ano in vendas_ano_mes.groupby('ano_date'):
    fig.add_trace(
        go.Scatter(x = vendas_do_ano['mes_date'], y = vendas_do_ano['sales'], name = str(ano)),
        row = 6, col = 1,
    )

# Panels 4-6: total sales by year, by month and by day of month, side by side.
for coluna_grid, chave in enumerate(('ano_date', 'mes_date', 'dia_date'), start = 1):
    total = treino.groupby(chave)['sales'].sum()
    fig.add_trace(go.Bar(x = total.index, y = total), row = 8, col = coluna_grid)

fig.update_layout(height=900)
fig.show()

In [None]:
# Total sales per product family, largest first.
vendas_por_familia = (
    treino.groupby('family', as_index = False)['sales']
    .sum()
    .sort_values('sales', ascending = False)
)
px.bar(vendas_por_familia, x = 'family', y = 'sales', height = 400, title = 'Vendas por Tipo de produtos')

In [None]:
# Total sales per store, largest first.
vendas_por_loja = (
    treino.groupby('store_nbr', as_index = False)['sales']
    .sum()
    .sort_values('sales', ascending = False)
)
px.bar(vendas_por_loja, x = 'store_nbr', y = 'sales', height = 400, title = 'Vendas por Loja')

## Explorando dados de Petroleo

**3. Visualizando os dados do PETROLEO:**

In [None]:
petroleo = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/oil.csv')

In [None]:
# Structure and summary statistics of the freshly loaded oil frame.
# `datetime_is_numeric` was dropped from the call: 'date' is still a plain string column here,
# so the flag had no effect, and the keyword no longer exists in pandas 2.x.
petroleo.info()
print('')
petroleo.describe()

In [None]:

petroleo.head()

In [None]:
# Parse the oil dates; pd.to_datetime works on every pandas version, whereas the unit-less
# astype('datetime64') cast is rejected by pandas 2.x.
petroleo['date'] = pd.to_datetime(petroleo['date'])


In [None]:
petroleo['dia_semana'] = petroleo['date'].dt.dayofweek

In [None]:
# Index the oil frame by date; later cells plot against this index and merge on the 'date' level.
petroleo = petroleo.set_index('date')

In [None]:
px.line(petroleo, x = petroleo.index, y = 'dcoilwtico', title = 'Evolução do preço do Petroleo', height = 300)

In [None]:
# Keep only the days that actually have an oil price, then summarise them.
petro = petroleo.dropna(subset = ['dcoilwtico'])
petro.describe()

In [None]:
px.line(petro, x = petro.index, y = 'dcoilwtico', title = 'Evolução do preço do Petroleo', height = 300)

#### Decomposição da Serie Temporal - VALORES (NA) EXCLUIDOS

In [None]:
# Multiplicative decomposition of the oil price (rows without a price were already removed).
# NOTE(review): period=12 is hard-coded; the data description calls this a daily price series,
# so confirm that a 12-observation cycle is really the intended seasonality.
deco_petro = seasonal_decompose(petro['dcoilwtico'], model = 'mult', period=12)

In [None]:
# Stack the three decomposition components (trend, seasonal, residual) in one figure.
fig, ax = plt.subplots(3, 1, figsize = (15, 9))
paineis = (
    (deco_petro.trend, 'tendencia'),
    (deco_petro.seasonal, 'sazonalidade'),
    (deco_petro.resid, 'residuos'),
)
for eixo, (componente, titulo) in zip(ax, paineis):
    componente.plot(ax = eixo, title = titulo)

plt.tight_layout()

#### Decomposição da Série Temporal - IMPUTAÇÃO KNN

In [None]:
petro2 = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/oil.csv')

In [None]:
# Imputer configured to use 10 neighbours, weighted by distance (closer neighbours count more).
knnimputer = KNNImputer(n_neighbors=10, weights = 'distance')

In [None]:
petro2.head()

In [None]:
# KNNImputer measures distance on the features that ARE observed. With the price as the only
# column, a row with a missing price has nothing to measure on, so every gap would simply be
# filled with the column mean. Adding the row position as a second feature makes the
# "neighbours" the observations closest in time.
posicao = np.arange(len(petro2), dtype = float)
vl = np.column_stack([posicao, petro2['dcoilwtico'].to_numpy()])
vlt = knnimputer.fit_transform(vl)
# Column 1 is the (now complete) price; column 0 is the helper position feature.
petro2['dcoilwtico'] = vlt[:, 1]

In [None]:
petro2.head()

In [None]:
# Decompose the imputed series and plot ITS components. The previous version computed
# deco_petro2 but then plotted deco_petro again, so the figure never showed the imputed data.
deco_petro2 = seasonal_decompose(petro2['dcoilwtico'], model = 'mult', period=12)

fig, ax = plt.subplots(3, 1, figsize = (15, 9))
deco_petro2.trend.plot(ax = ax[0], title = 'tendencia')
deco_petro2.seasonal.plot(ax = ax[1], title = 'sazonalidade')
deco_petro2.resid.plot(ax = ax[2], title = 'residuos')

plt.tight_layout()

In [None]:
px.scatter(petro2, x = petro2.index, y = petro2['dcoilwtico'], trendline="lowess", height = 350)

In [None]:
petro2.dcoilwtico.describe()

In [None]:
# Compare the mean of the NA-dropped series with the mean of the imputed series.
# A paired test (ttest_rel) needs observation i of one sample to correspond to observation i
# of the other; slicing petro2 to len(petro) pairs up different days as soon as one row has
# been dropped. The two samples are therefore compared as independent groups, using Welch's
# variant so equal variances are not assumed.
stats.ttest_ind(petro['dcoilwtico'], petro2['dcoilwtico'], equal_var = False)


**Conclusão:**  

Não há evidências para rejeitar a hipótese nula de que as médias são iguais; portanto, do ponto de vista da média, qualquer um dos dois datasets poderá ser utilizado.


#### Verificando a estacionariedade da variável Petróleo

#### Teste de normalidade

In [None]:

# D'Agostino-Pearson test: the null hypothesis is that the sample comes from a normal
# distribution. It does not test stationarity, so the messages now report normality only
# (a unit-root test such as ADF would be needed to talk about stationarity).
k, p = stats.normaltest(petro.dcoilwtico)

if p < 0.05:
    # {:.4g} keeps very small p-values readable instead of printing 0.000000.
    print('Há significância estatística (p = {:.4g}) para rejeitar a hipótese nula'.format(p))
    print('A série temporal NÃO segue uma distribuição normal')

else:
    print('Não há evidência para rejeitar a normalidade da série temporal (p = {:.4g})'.format(p))
    


In [None]:
# Rebuild the frame with 'date' as its index (round trip through a regular column).
petro = petro.reset_index().set_index('date')

In [None]:
# Autocorrelation of the oil price for the first 100 lags.
fig, ax = plt.subplots(figsize = (15, 5))

# plot_acf returns the Figure, not an Axes, so the result is no longer stored in `ax`;
# the trailing semicolon keeps the notebook from rendering the figure a second time.
sm.graphics.tsa.plot_acf(petro['dcoilwtico'], lags=100, ax = ax);



In [None]:
# Lagged copies of the oil price: column oil_<k>t holds the value from k rows earlier.
for defasagem in (1, 2, 3, 7, 14, 30, 60, 90):
    petro['oil_{}t'.format(defasagem)] = petro['dcoilwtico'].shift(defasagem)

In [None]:

petro.head()

In [None]:
def vl_transform(x):
    """Apply a Box-Cox transform to a numeric series.

    Box-Cox is only defined for strictly positive data, so every value that
    is zero or negative is replaced by 1 before the transform is fitted.

    Parameters
    ----------
    x : pandas.Series (or any object exposing ``.values``)
        Numeric observations to transform.

    Returns
    -------
    tuple
        ``(transformed_values, lmbda)`` exactly as returned by
        ``scipy.stats.boxcox`` when no ``lmbda`` is supplied.
    """
    valores = np.asarray(x.values, dtype=float)
    # NaN compares False against 0, so missing values pass through untouched,
    # matching the element-by-element check this replaces.
    valores = np.where(valores <= 0, 1.0, valores)
    return stats.boxcox(valores)
    

In [None]:
# Box-Cox transform of the oil price taken from `petro` (the frame without missing prices).
# vl_transform returns the scipy.stats.boxcox result, so vlt[0] holds the transformed values.
vls = petro['dcoilwtico']
vlt = vl_transform(vls)


In [None]:
# vlt was computed from `petro`, which only keeps the days that have a price, so whenever
# `petroleo` contains missing prices the raw array is shorter than the frame and a direct
# assignment raises a length-mismatch error. Wrapping the values in a Series keyed by petro's
# date index lets pandas align on date; days without a price get NaN.
petroleo['oil_trasf'] = pd.Series(vlt[0], index = petro.index)

In [None]:
petroleo.describe()

In [None]:
# Shapiro-Wilk normality test on the raw and on the Box-Cox transformed oil price.
# Missing values are dropped first: with NaN in the sample the statistic is not meaningful.
stat, p = stats.shapiro(petroleo['dcoilwtico'].dropna())
print('Variavel real - Statistics = %.3f, p = %.3f' % (stat, p))
print('')
stat, p = stats.shapiro(petroleo['oil_trasf'].dropna())
print('Variavel Transformada BxCx - Statistics = %.3f, p = %.3f' % (stat, p))

**3.1.** JOIN com os dados das vendas - DADOS TREINO

In [None]:
# Attach the oil columns to every sales row. The default inner join silently discards all
# sales on dates that have no row in the oil frame; a left join keeps every sale and leaves
# the oil columns empty for those dates.
treino = treino.merge(petroleo, left_on = 'date', right_on = 'date', how = 'left')
# 'dia_semana' arrives from the oil frame and is empty for unmatched dates, so recompute it
# from the sales date itself.
treino['dia_semana'] = treino['date'].dt.dayofweek

In [None]:
treino.tail() 

**3.2.** Correlação da série temporal entre variáveis

In [None]:
# Daily frame: oil price of the day (max over the day's rows) and total sales of the day,
# plus the oil price shifted back by a number of rows.
dfi = treino.groupby('date').agg({'dcoilwtico': 'max', 'sales': 'sum'})
for dias in (7, 15, 30, 60, 90):
    dfi['oil_{}dias'.format(dias)] = dfi['dcoilwtico'].shift(dias)


In [None]:
# Pearson correlation between daily sales and each oil column (current price and its lags).
colunas = ['dcoilwtico', 'oil_7dias', 'oil_15dias', 'oil_30dias', 'oil_60dias', 'oil_90dias']
coef_corr = [dfi[c].corr(dfi['sales']) for c in colunas]
    #print(dfi[[c, 'sales']].corr().values[1, 0].reshape((-1, 1))[0, 0])

In [None]:

# Scatter grid (3 x 2): daily sales against the oil price and against each lagged price.
# The number shown in each title is the Pearson correlation coefficient r (it can be
# negative), so it is labelled "r" rather than "R2".
paineis = [
    ('dcoilwtico', 'Valor do petroleo x Vendas'),
    ('oil_7dias',  'Valor do petroleo 7 Dias Atrás x Vendas'),
    ('oil_15dias', 'Valor do petroleo 15 Dias Atrás x Vendas'),
    ('oil_30dias', 'Valor do petroleo 30 Dias Atrás x Vendas'),
    ('oil_60dias', 'Valor do petroleo 60 Dias Atrás x Vendas'),
    ('oil_90dias', 'Valor do petroleo 90 Dias Atrás x Vendas'),
]

fig = make_subplots(
    rows=3, cols=2,
    subplot_titles=[
        '{} - r: {}'.format(titulo, np.round(r, 2))
        for (_, titulo), r in zip(paineis, coef_corr)
    ],
)

# Fill the grid row by row: panel i goes to row i // 2 + 1, column i % 2 + 1.
for i, (coluna, _) in enumerate(paineis):
    fig.add_trace(
        go.Scatter(mode='markers', x = dfi[coluna], y = dfi['sales']),
        row = i // 2 + 1, col = i % 2 + 1,
    )

fig.update_layout(height=700)

fig.show()

## Explorando os DATASET dos feriados

In [None]:
feriados.info()
feriados.head()

In [None]:
# Summary of the holidays frame. `datetime_is_numeric` was dropped: 'date' is still a string
# column at this point, so the flag had no effect, and it no longer exists in pandas 2.x.
feriados.describe()

**4.1.** Quantidade de variaveis unicas nos **FERIADOS**

In [None]:
print(feriados.nunique())
print('')
print(feriados['type'].value_counts())

**4.2.** Join aos dados de treino **FERIADOS**

In [None]:
# Parse holiday dates so they can be merged with the (datetime) sales dates.
# pd.to_datetime replaces the unit-less astype('datetime64') cast, which pandas 2.x rejects.
feriados['date'] = pd.to_datetime(feriados['date'])


In [None]:
# Attach the holiday type to each sales row.
# If the holidays table lists more than one event on the same date, a plain left join
# duplicates every sales row of that date once per event, inflating the sales totals.
# Keep one event per date (the first listed) and let `validate` fail loudly if the right
# side is ever not unique per date.
feriados_por_dia = feriados[['date', 'type']].drop_duplicates(subset = 'date')
treino = treino.merge(feriados_por_dia, on = 'date', how = 'left', validate = 'many_to_one')

In [None]:
treino.head()

## Pré processamento para treinamento

In [None]:
# Integer codes (starting at 1) for each product family and each holiday type, in order of
# first appearance. Missing values get no code.
# The family filter used to be `f != pd.isna(f)`, which compares the label with a boolean and
# is therefore always true; both filters now use pd.notna.
dicfamily = {f: i + 1 for i, f in enumerate(treino['family'].unique()) if pd.notna(f)}
dicferiado = {f: i + 1 for i, f in enumerate(treino['type'].unique()) if pd.notna(f)}

In [None]:
# Numeric versions of the two categorical columns; values without an entry in the
# dictionaries (including missing holiday types) become NaN.
treino['CD_FAMILY'] = treino['family'].map(dicfamily)
treino['CD_FERIADO'] = treino['type'].map(dicferiado)

In [None]:
# Summary statistics of the merged training frame, including the datetime column.
# NOTE(review): `datetime_is_numeric` was removed in pandas 2.0 (datetimes are summarised
# numerically by default there); drop the keyword when upgrading pandas.
treino.describe(datetime_is_numeric= True)

In [None]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [None]:
# Fix the random seed so that "Restart & Run All" reproduces the same fitted models and
# scores; n_jobs=-1 only parallelises tree building and does not change the result.
rfr = RandomForestRegressor(random_state=42, n_jobs=-1)
gbr = GradientBoostingRegressor(random_state=42)

In [None]:
# Fill the gaps in the oil features. Zero is not a plausible oil price and would look like a
# price crash to the model, so each date without a quote takes the most recent known value
# (and the very first dates, which have no earlier quote, take the next known one).
for coluna_oil in ('dcoilwtico', 'oil_trasf'):
    if coluna_oil in treino.columns:
        # One value per date, in chronological order, independent of the row order of treino.
        oil_por_data = treino.groupby('date')[coluna_oil].first().sort_index().ffill().bfill()
        treino[coluna_oil] = treino['date'].map(oil_por_data)

# A missing holiday code means "no holiday that day"; 0 is free because real codes start at 1.
treino['CD_FERIADO'] = treino['CD_FERIADO'].fillna(0)

In [None]:
# Remaining missing values per column. `X` is not defined anywhere above in the notebook
# (it only existed in an earlier interactive session), so inspect the training frame itself.
treino.isna().sum()

In [None]:
# X and Y are not created anywhere above, so a bare `del X, Y` raises NameError on a fresh
# kernel. Remove them only if they happen to exist (e.g. left over from an older session).
for _nome in ('X', 'Y'):
    globals().pop(_nome, None)

In [None]:
# Time-based split: rows before the cut-off date train the model, rows from the cut-off
# onwards are held out. Identifier, raw categorical and target columns are removed from the
# feature matrices.
DATA_CORTE = '2017-06-01'
COLUNAS_FORA = ['id', 'date', 'family', 'anomes', 'sales', 'type']

mascara_treino = treino['date'] < DATA_CORTE
mascara_teste = treino['date'] >= DATA_CORTE

X_treino = treino[mascara_treino].drop(columns = COLUNAS_FORA)
Y_treino = treino.loc[mascara_treino, 'sales']
X_teste = treino[mascara_teste].drop(columns = COLUNAS_FORA)
Y_teste = treino.loc[mascara_teste, 'sales']

In [None]:
rfr.fit(X_treino, Y_treino)

In [None]:
# Score computed on the same rows the model was fitted on, so it is an optimistic
# figure and not a validation result.
rfr.score(X_treino, Y_treino)


In [None]:
previsao = rfr.predict(X_teste)

In [None]:
from sklearn import metrics

In [None]:
# Mean squared error of the predictions on the held-out rows (date >= 2017-06-01).
metrics.mean_squared_error(Y_teste, previsao)

In [None]:
# Put date, actual sales and prediction side by side for the held-out rows.
# X_teste is a slice of `treino`, so its dates must come from `treino` through the shared row
# index. `teste` is the competition test file: its index is unrelated to these rows, so
# assigning teste['date'] aligns on the wrong labels.
X_teste['date'] = treino.loc[X_teste.index, 'date']
X_teste['vendas_real'] = Y_teste
X_teste['previsao'] = previsao

In [None]:
# Actual vs. predicted sales over time for the held-out period. Dates go on the x axis
# (looked up in `treino` through the shared row index) instead of being drawn as a third
# line, and rows are summed per day so the chart shows one point per date.
(
    X_teste[['vendas_real', 'previsao']]
    .groupby(treino.loc[X_teste.index, 'date'])
    .sum()
    .plot(figsize = (15, 5), title = 'Vendas reais x previsão (total diário)')
)

## Preparando os dados de TESTE ##

In [None]:
teste.info()

In [None]:
# Summary of the test frame. `datetime_is_numeric` was dropped: 'date' is still a string
# column at this point, so the flag had no effect, and it no longer exists in pandas 2.x.
teste.describe()

In [None]:
# Same calendar features as the training frame, so both frames expose identical columns.
# pd.to_datetime replaces the unit-less astype('datetime64') cast, which pandas 2.x rejects.
teste['date']     = pd.to_datetime(teste['date'])
teste["mes_date"] = teste['date'].dt.month
teste['ano_date'] = teste['date'].dt.year
teste["dia_date"] = teste['date'].dt.day
# 'YYYYMM' string key (same value the period -> str -> strip('-') chain produced).
teste['anomes']   = teste['date'].dt.strftime('%Y%m')

In [None]:
# Attach the oil columns to the test rows. A left join is required here: the default inner
# join would drop every test row whose date has no oil record, and those rows still need a
# prediction.
teste = teste.merge(petroleo, left_on = 'date', right_on = 'date', how = 'left')
# 'dia_semana' arrives from the oil frame and is empty for unmatched dates, so recompute it
# from the test date itself.
teste['dia_semana'] = teste['date'].dt.dayofweek

In [None]:
# Attach the holiday type to each test row. One event per date is kept so that a date with
# several listed events cannot duplicate test rows; `validate` enforces that guarantee.
feriados_por_dia = feriados[['date', 'type']].drop_duplicates(subset = 'date')
teste = teste.merge(feriados_por_dia, on = 'date', how = 'left', validate = 'many_to_one')

In [None]:
# Reuse the dictionaries built from the training frame so a code means the same category in
# both frames; values without an entry in the dictionaries become NaN.
teste['CD_FAMILY'] =  teste['family'].map(dicfamily)
teste['CD_FERIADO'] = teste['type'].map(dicferiado)

In [None]:
rfr.