In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime

In [None]:
# Load the sales records (train.csv) from the Kaggle input directory.
sales_train_df = pd.read_csv('../input/rossmann-store-sales/train.csv')

In [None]:
# (rows, columns) of the raw training frame.
sales_train_df.shape

In [None]:
# First rows, to see which columns are available.
sales_train_df.head()

In [None]:
# This cell and the next four list the distinct values of DayOfWeek, Open,
# Promo, StateHoliday and SchoolHoliday to check how each column is encoded.
sales_train_df['DayOfWeek'].unique()

In [None]:
sales_train_df['Open'].unique()

In [None]:
sales_train_df['Promo'].unique()

In [None]:
sales_train_df['StateHoliday'].unique()

In [None]:
sales_train_df['SchoolHoliday'].unique()

In [None]:
# Last rows of the file, as a complement to head() above.
sales_train_df.tail()

In [None]:
# Column dtypes and non-null counts.
sales_train_df.info()

In [None]:
# Summary statistics of the numeric columns.
sales_train_df.describe()

In [None]:
# Load the store table (store.csv) from the same Kaggle dataset.
# NOTE(review): presumably one row per store; it is merged on 'Store' later.
store_info_df = pd.read_csv('../input/rossmann-store-sales/store.csv')

In [None]:
# (rows, columns) of the store table.
store_info_df.shape

In [None]:
# First rows, to see which store attributes are available.
store_info_df.head()

In [None]:
# Column dtypes and non-null counts; shows which columns have missing values.
store_info_df.info()

In [None]:
# Summary statistics of the numeric store attributes.
store_info_df.describe()

In [None]:
# Heatmap of the null mask: any missing cell would show up as a contrasting mark.
# The trailing semicolon suppresses the Axes repr.
sns.heatmap(sales_train_df.isnull());

In [None]:
# Same check numerically: count of missing values per column.
sales_train_df.isnull().sum()

In [None]:
# One histogram per numeric column to inspect the distributions.
sales_train_df.hist(bins = 30, figsize=(20, 20), color = 'r')

In [None]:
# Largest value found in the Customers column.
sales_train_df['Customers'].max()

In [None]:
# Split the records by the Open flag (0 -> closed_train_df, 1 -> open_train_df).
closed_train_df = sales_train_df[sales_train_df['Open'] == 0]
open_train_df = sales_train_df[sales_train_df['Open'] == 1]

In [None]:
# Row counts: all records, records with Open == 0, records with Open == 1.
print('Total = ', len(sales_train_df))
print('Número de lojas/dias fechado = ', len(closed_train_df))
print('Número de lojas/dias aberto = ', len(open_train_df))

In [None]:
# Average number of closed-day records per row of the store table.
# The numerator is computed from closed_train_df instead of being pasted from
# the printed count, so it stays correct if the input data changes.
len(closed_train_df) / len(store_info_df)

In [None]:
# Sample of the records taken on days when the store was closed.
closed_train_df.head()

In [None]:
# Keep only the records with Open == 1; the closed-day rows are discarded
# from the working frame from here on.
sales_train_df = sales_train_df[sales_train_df['Open'] == 1]

In [None]:
# Shape after the filter.
sales_train_df.shape

In [None]:
# Display the filtered frame (pandas truncates the rendering of large frames).
sales_train_df

In [None]:
# After the Open == 1 filter the column is constant, so it is removed.
# The result is reassigned rather than dropped in place: sales_train_df is a
# filtered slice of the raw frame, and mutating a slice in place triggers
# SettingWithCopyWarning. errors='ignore' keeps the cell safe to re-run.
sales_train_df = sales_train_df.drop(columns=['Open'], errors='ignore')

In [None]:
# Confirm that the Open column is gone.
sales_train_df.head()

In [None]:
# Summary statistics again, now computed on the filtered records only.
sales_train_df.describe()

In [None]:
# Null mask of the store table (colour bar hidden).
sns.heatmap(store_info_df.isnull(), cbar=False);

In [None]:
# Rows of the store table with no CompetitionDistance.
store_info_df[store_info_df['CompetitionDistance'].isnull()]

In [None]:
# Rows with no CompetitionOpenSinceMonth.
store_info_df[store_info_df['CompetitionOpenSinceMonth'].isnull()]

In [None]:
# Rows with no CompetitionOpenSinceYear.
store_info_df[store_info_df['CompetitionOpenSinceYear'].isnull()]

In [None]:
# Rows with Promo2 == 0.
# NOTE(review): presumably these are the rows whose Promo2* columns are null;
# confirm against the displayed output.
store_info_df[store_info_df['Promo2'] == 0]

In [None]:
# Columns whose missing values are replaced with 0.
str_cols = ['Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval',
            'CompetitionOpenSinceYear', 'CompetitionOpenSinceMonth']
# Fill all of them in one assignment. This avoids two problems of a
# `for str in ...: df[str].fillna(0, inplace=True)` loop: the loop variable
# would shadow the builtin `str` for every later cell, and an in-place fillna
# on a column selection is a chained assignment that recent pandas versions
# deprecate (it does not update the frame under copy-on-write).
store_info_df[str_cols] = store_info_df[str_cols].fillna(0)

In [None]:
# Redraw the null mask of the store table to see which columns still have gaps.
sns.heatmap(store_info_df.isnull(), cbar = False);

In [None]:
# Replace missing CompetitionDistance values with the column mean.
# The filled column is assigned back explicitly: an in-place fillna on a column
# selection is a chained assignment that recent pandas versions deprecate and
# that does not update the frame under copy-on-write.
store_info_df['CompetitionDistance'] = store_info_df['CompetitionDistance'].fillna(
    store_info_df['CompetitionDistance'].mean()
)

In [None]:
# Null mask of the store table after the CompetitionDistance imputation.
sns.heatmap(store_info_df.isnull(), cbar = False);

In [None]:
# One histogram per numeric store attribute.
store_info_df.hist(bins = 30, figsize=(20,20), color = 'r')

In [None]:
# Look at both frames side by side before joining them.
sales_train_df.head()

In [None]:
store_info_df.head()

In [None]:
# Attach the store attributes to every sales record. The inner join keeps only
# the Store values present in both frames.
sales_train_all_df = pd.merge(sales_train_df, store_info_df, how = 'inner', on = 'Store')

In [None]:
# Shape of the merged frame.
sales_train_all_df.shape

In [None]:
# Last rows of the merged frame.
sales_train_all_df.tail()

In [None]:
# Pairwise correlations between the numeric columns. The frame also holds
# text columns, and DataFrame.corr() raises on those in pandas >= 2.0, so the
# numeric columns are selected explicitly (this works on older versions too).
correlations = sales_train_all_df.select_dtypes(include='number').corr()
f, ax = plt.subplots(figsize=(20,20))
# Draw on the axes created above rather than relying on the implicit current axes.
sns.heatmap(correlations, annot = True, ax = ax);

In [None]:
# Correlation of every numeric column with Sales, sorted ascending.
# Numeric columns are selected first because DataFrame.corr() raises on text
# columns in pandas >= 2.0.
correlations = sales_train_all_df.select_dtypes(include='number').corr()['Sales'].sort_values()
correlations

In [None]:
# Derive the calendar year from the Date column.
sales_train_all_df['Year'] = pd.DatetimeIndex(sales_train_all_df['Date']).year

In [None]:
# Check that the new Year column is present.
sales_train_all_df.head()

In [None]:
sales_train_all_df['Month'] = pd.DatetimeIndex(sales_train_all_df['Date']).month
sales_train_all_df['Day'] = pd.DatetimeIndex(sales_train_all_df['Date']).day

In [None]:
sales_train_all_df.head()

In [None]:
axis = sales_train_all_df.groupby('Month')[['Sales']].mean().plot(figsize = (10,5), marker = 'o', color = 'r')
axis.set_title('Média de vendas por mês')

In [None]:
axis = sales_train_all_df.groupby('Month')[['Customers']].mean().plot(figsize = (10,5), marker = '^', color = 'b')
axis.set_title('Média de clientes por mês')

In [None]:
axis = sales_train_all_df.groupby('Day')[['Sales']].mean().plot(figsize = (10,5), marker = 'o', color = 'r')
axis.set_title('Média de vendas por dia')

In [None]:
axis = sales_train_all_df.groupby('Day')[['Customers']].mean().plot(figsize = (10,5), marker = '^', color = 'b')
axis.set_title('Média de clientes por dia')

In [None]:
axis = sales_train_all_df.groupby('DayOfWeek')[['Sales']].mean().plot(figsize = (10,5), marker = 'o', color = 'r')
axis.set_title('Média de vendas por dia da semana')

In [None]:
axis = sales_train_all_df.groupby('DayOfWeek')[['Customers']].mean().plot(figsize = (10,5), marker = '^', color = 'b')
axis.set_title('Média de clientes por dia da semana')

In [None]:
# Mean sales per date, one line per StoreType.
fig, ax = plt.subplots(figsize = (20,10))
# Select 'Sales' before aggregating: averaging every column of the merged
# frame is wasted work and raises on the text columns in pandas >= 2.0.
sales_train_all_df.groupby(['Date', 'StoreType'])['Sales'].mean().unstack().plot(ax = ax)

In [None]:
sns.barplot(x = 'Promo', y = 'Sales', data = sales_train_all_df);

In [None]:
sns.barplot(x = 'Promo', y = 'Customers', data = sales_train_all_df);

## Treinamento do modelo - parte 1

In [None]:
!pip install fbprophet

In [None]:
from fbprophet import Prophet

In [None]:
# NOTE: a later cell defines sales_prediction again with an extra `holidays`
# argument; once that cell runs, it replaces this version.
def sales_prediction(store_id, sales_df, periods):
  """Fit a Prophet model on one store's sales and forecast future periods.

  Prophet expects the date column to be named 'ds' and the target 'y', so
  'Date' and 'Sales' are renamed accordingly. Two figures are drawn as a side
  effect: the forecast and its components.

  Args:
    store_id: Value matched against the 'Store' column of `sales_df`.
    sales_df: DataFrame with at least 'Store', 'Date' and 'Sales' columns.
    periods: Number of future periods passed to `make_future_dataframe`
      (days with Prophet's default frequency).

  Returns:
    Tuple `(history, forecast)`: the selected store's rows with columns
    'ds' and 'y' sorted by 'ds', and the frame returned by `model.predict`.

  Raises:
    ValueError: If `sales_df` has no rows for `store_id`.
  """
  history = (
      sales_df.loc[sales_df['Store'] == store_id, ['Date', 'Sales']]
      .rename(columns = {'Date': 'ds', 'Sales': 'y'})
      .sort_values(by = 'ds')
  )
  # Fail with an explicit message instead of an error from inside the fit.
  if history.empty:
    raise ValueError(f'No sales records found for store {store_id!r}')

  model = Prophet()
  model.fit(history)
  future = model.make_future_dataframe(periods = periods)
  forecast = model.predict(future)
  # The figures are rendered by the notebook; their handles are not needed.
  model.plot(forecast, xlabel = 'Data', ylabel = 'Vendas')
  model.plot_components(forecast)

  return history, forecast

In [None]:
#df = sales_prediction(10, sales_train_all_df, 60)
#df

In [None]:
# Forecast 60 periods ahead for store 10; returns the training history and
# the forecast frame.
df_origin, df_prediction = sales_prediction(10, sales_train_all_df, 60)

In [None]:
# Shapes of the history and of the forecast.
df_origin.shape, df_prediction.shape

In [None]:
# First rows of the forecast frame.
df_prediction.head()

In [None]:
# Last 60 rows of the forecast, matching the 60 periods requested above.
df_prediction.tail(60)

In [None]:
# Save those 60 rows to a CSV file in the working directory.
df_prediction.tail(60).to_csv('previsoes_vendas.csv')

In [None]:
# Last rows of the history, to see where the observed data ends.
df_origin.tail()

In [None]:
def sales_prediction(store_id, sales_df, holidays, periods):
  """Fit a Prophet model with holiday effects on one store's sales.

  Prophet expects the date column to be named 'ds' and the target 'y', so
  'Date' and 'Sales' are renamed accordingly. Two figures are drawn as a side
  effect: the forecast and its components.

  Args:
    store_id: Value matched against the 'Store' column of `sales_df`.
    sales_df: DataFrame with at least 'Store', 'Date' and 'Sales' columns.
    holidays: DataFrame passed to `Prophet(holidays=...)` (columns 'ds' and
      'holiday'), or None to fit without holiday effects.
    periods: Number of future periods passed to `make_future_dataframe`
      (days with Prophet's default frequency).

  Returns:
    Tuple `(history, forecast)`: the selected store's rows with columns
    'ds' and 'y' sorted by 'ds', and the frame returned by `model.predict`.

  Raises:
    ValueError: If `sales_df` has no rows for `store_id`.
  """
  history = (
      sales_df.loc[sales_df['Store'] == store_id, ['Date', 'Sales']]
      .rename(columns = {'Date': 'ds', 'Sales': 'y'})
      .sort_values(by = 'ds')
  )
  # Fail with an explicit message instead of an error from inside the fit.
  if history.empty:
    raise ValueError(f'No sales records found for store {store_id!r}')

  if holidays is not None:
    # A holiday frame built from per-store records repeats each date once per
    # store. Identical rows add no information, so they are removed; the
    # caller's frame is left untouched.
    holidays = holidays.drop_duplicates()

  model = Prophet(holidays=holidays)
  model.fit(history)
  future = model.make_future_dataframe(periods = periods)
  forecast = model.predict(future)
  # The figures are rendered by the notebook; their handles are not needed.
  model.plot(forecast, xlabel = 'Data', ylabel = 'Vendas')
  model.plot_components(forecast)

  return history, forecast

In [None]:
# Reminder of the available columns before extracting the holiday dates.
sales_train_all_df.head()

In [None]:
# Dates of every record flagged SchoolHoliday == 1, as a NumPy array.
# The array has one entry per matching record, so a date is repeated whenever
# more than one record shares it.
school_holidays = sales_train_all_df[sales_train_all_df['SchoolHoliday'] == 1].loc[:, 'Date'].values
school_holidays.shape

In [None]:
school_holidays

In [None]:
# Number of distinct dates among them.
len(np.unique(school_holidays))

In [None]:
# Dates of every record whose StateHoliday code is 'a', 'b' or 'c'.
state_holidays = sales_train_all_df.loc[
    sales_train_all_df['StateHoliday'].isin(['a', 'b', 'c']), 'Date'
].values

In [None]:
# Number of records flagged as a state holiday.
state_holidays.shape

In [None]:
# Number of distinct dates among them.
len(np.unique(state_holidays))

In [None]:
# Holiday table in the layout Prophet expects: 'ds' (date) and 'holiday' (label).
# The source array has one entry per record, so the dates are de-duplicated
# first; each holiday date then appears once instead of once per matching record.
state_holidays = pd.DataFrame({'ds': pd.to_datetime(np.unique(state_holidays)),
                               'holiday': 'state_holiday'})
state_holidays

In [None]:
# Holiday table in the layout Prophet expects: 'ds' (date) and 'holiday' (label).
# The source array has one entry per record, so the dates are de-duplicated
# first; each holiday date then appears once instead of once per matching record.
school_holidays = pd.DataFrame({'ds': pd.to_datetime(np.unique(school_holidays)),
                                'holiday': 'school_holiday'})
school_holidays

In [None]:
# Stack both holiday tables into the single frame handed to Prophet.
school_state_holidays = pd.concat((state_holidays, school_holidays))
school_state_holidays

In [None]:
# Forecast 5 periods ahead for store 10, this time with holiday effects.
df_original, df_prediction = sales_prediction(10, sales_train_all_df, school_state_holidays, 5)

In [None]:
# First rows of the new forecast frame.
df_prediction.head()