In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime

### Loading the first dataset (sales)

In [None]:
# Read the sales training CSV; the path is relative to the notebook's working directory.
sales_train_df = pd.read_csv('../input/rossmann-store-sales/train.csv')

In [None]:
sales_train_df.head()

In [None]:
sales_train_df['Promo'].unique()

In [None]:
sales_train_df['SchoolHoliday'].unique()

In [None]:
sales_train_df['StateHoliday'].unique()

In [None]:
sales_train_df['Store'].unique()

In [None]:
sales_train_df.info()

In [None]:
sales_train_df.describe()

### LOADING STORE DATASET

In [None]:
# Read the per-store CSV; the path is relative to the notebook's working directory.
store_df = pd.read_csv('../input/rossmann-store-sales/store.csv')

In [None]:
store_df.head()

In [None]:
store_df.info()

In [None]:
store_df.describe()

### EDA - SALES

In [None]:
sales_train_df.isnull().sum()

In [None]:
sales_train_df.hist(bins = 30, figsize=(20,20), color = 'r')

In [None]:
# Split the sales rows by the 'Open' flag: 0 -> closed, 1 -> open.
closed_train_df = sales_train_df.loc[sales_train_df['Open'].eq(0)]
open_train_df = sales_train_df.loc[sales_train_df['Open'].eq(1)]

In [None]:
# Report how many store/day rows exist in total and in each 'Open' subset.
print('Total = ', sales_train_df.shape[0])
print('Number of shops/days OPEN = ', open_train_df.shape[0])
print('Number of shops/days CLOSED = ', closed_train_df.shape[0])

In [None]:
# Average number of closed days per store, computed from the data instead of
# a hard-coded row count (assumes store_df has one row per store - TODO confirm).
len(closed_train_df) / len(store_df)

In [None]:
# On average, each shop was closed for about 155 days over the roughly 2-year period covered by the data

In [None]:
# Keep only the rows where the store was open.
sales_train_df = sales_train_df.loc[sales_train_df['Open'].eq(1)]

In [None]:
# 'Open' is constant (1) after the filter above, so it carries no information.
# Reassign instead of dropping in place: sales_train_df is a filtered slice,
# and mutating a slice in place can trigger SettingWithCopyWarning.
sales_train_df = sales_train_df.drop(columns=['Open'])

In [None]:
sales_train_df.describe()

### EDA - STORES

In [None]:
# Heatmap of the boolean null mask: highlights which store_df cells are missing.
sns.heatmap(store_df.isnull())

In [None]:
store_df[store_df['CompetitionDistance'].isnull()]

In [None]:
store_df[store_df['CompetitionOpenSinceMonth'].isnull()]

In [None]:
store_df[store_df['CompetitionOpenSinceYear'].isnull()]

In [None]:
store_df[store_df['Promo2'] == 0]

In [None]:
str_cols = ['Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval', 'CompetitionOpenSinceYear', 'CompetitionOpenSinceMonth']
# Fill missing values with 0 and assign the result back. Calling
# fillna(inplace=True) on a column selection is chained assignment, which is
# deprecated and has no effect under pandas copy-on-write. The vectorized form
# also avoids a loop variable that would shadow the builtin `str`.
store_df[str_cols] = store_df[str_cols].fillna(0)

In [None]:
# Impute missing competition distances with the column mean. Assign the result
# back rather than using fillna(inplace=True) on a column selection, which is
# chained assignment and has no effect under pandas copy-on-write.
store_df['CompetitionDistance'] = store_df['CompetitionDistance'].fillna(
    store_df['CompetitionDistance'].mean()
)

In [None]:
# Re-plot the null mask (without the colour bar) to check what is still missing after the fills.
sns.heatmap(store_df.isnull(), cbar = False)

In [None]:
store_df.hist(bins = 30, figsize=(20,20), color='g')

### MERGING SALES+STORE

In [None]:
# Attach the per-store attributes to every sales row (inner join on 'Store').
sales_train_all = sales_train_df.merge(store_df, how='inner', on='Store')

In [None]:
sales_train_all.shape

In [None]:
sales_train_all.head()

In [None]:
# Correlate numeric columns only: the merged frame also holds non-numeric
# columns (e.g. 'Date', 'StateHoliday'), on which DataFrame.corr() raises in
# pandas >= 2.0. select_dtypes works on older pandas versions as well.
correlations = sales_train_all.select_dtypes(include=['number', 'bool']).corr()
f, ax = plt.subplots(figsize=(20,20))
# Draw on the axes created above instead of relying on the implicit current axes.
sns.heatmap(correlations, annot = True, ax = ax)

In [None]:
# Correlation of every numeric column with 'Sales', strongest positive first.
# Non-numeric columns are excluded explicitly because DataFrame.corr() raises
# on them in pandas >= 2.0.
correlations = (
    sales_train_all
    .select_dtypes(include=['number', 'bool'])
    .corr()['Sales']
    .sort_values(ascending=False)
)
correlations

In [None]:
# Parse the 'Date' column once and derive the calendar parts from the same
# index, instead of re-parsing the whole column for each part.
date_index = pd.DatetimeIndex(sales_train_all['Date'])
sales_train_all['Day'] = date_index.day
sales_train_all['Month'] = date_index.month
sales_train_all['Year'] = date_index.year

In [None]:
sales_train_all.head()

In [None]:
# Mean sales for each calendar month.
axis = (
    sales_train_all
    .groupby('Month')[['Sales']]
    .mean()
    .plot(figsize=(10,5), marker='o')
)
axis.set_title('Average SALES per MONTH')

In [None]:
# Mean customer count for each calendar month.
axis = (
    sales_train_all
    .groupby('Month')[['Customers']]
    .mean()
    .plot(figsize=(10,5), marker='x', color='black')
)
axis.set_title('Average CUSTOMERS per MONTH')

In [None]:
# Mean sales for each day of the month.
axis = (
    sales_train_all
    .groupby('Day')[['Sales']]
    .mean()
    .plot(figsize=(10,5), marker='o')
)
axis.set_title('Average SALES per DAY')

In [None]:
# Mean customer count for each day of the month.
axis = (
    sales_train_all
    .groupby('Day')[['Customers']]
    .mean()
    .plot(figsize=(10,5), marker='x', color='black')
)
axis.set_title('Average CUSTOMERS per DAY')

In [None]:
# Mean sales for each value of 'DayOfWeek'.
axis = (
    sales_train_all
    .groupby('DayOfWeek')[['Sales']]
    .mean()
    .plot(figsize=(10,5), marker='o')
)
axis.set_title('Average SALES per WEEK DAY')

In [None]:
# Mean customer count for each value of 'DayOfWeek'.
axis = (
    sales_train_all
    .groupby('DayOfWeek')[['Customers']]
    .mean()
    .plot(figsize=(10,5), marker='x', color='black')
)
axis.set_title('Average CUSTOMERS per WEEK DAY')

In [None]:
# Daily mean sales, one line per store type.
fig, ax = plt.subplots(figsize=(20,10))
# Select 'Sales' before aggregating: averaging every column is wasted work and
# raises on the non-numeric columns in pandas >= 2.0.
sales_train_all.groupby(['Date', 'StoreType'])['Sales'].mean().unstack().plot(ax = ax)

In [None]:
# Compare 'Sales' between rows with and without a promo (one bar per 'Promo' value).
sns.barplot(x = 'Promo', y = 'Sales', data = sales_train_all)

In [None]:
# Compare 'Customers' between rows with and without a promo (one bar per 'Promo' value).
sns.barplot(x = 'Promo', y = 'Customers', data = sales_train_all)

### TRAINING MODEL (Facebook Prophet)

In [None]:
# The library was renamed from `fbprophet` to `prophet` in its 1.0 release.
# Try the current package name first and fall back to the legacy one so the
# notebook runs in either environment.
try:
    from prophet import Prophet
except ImportError:
    from fbprophet import Prophet

In [None]:
# Prophet naming convention: the date column must be called 'ds'
# and the value to forecast must be called 'y'.
def sales_predict(store_id, sales_df, periods, holidays=None):
    """Fit a Prophet model on one store's sales history and forecast ahead.

    Parameters
    ----------
    store_id
        Value matched against the 'Store' column to select the store's rows.
    sales_df : pandas.DataFrame
        Must contain the columns 'Store', 'Date' and 'Sales'.
    periods : int
        Number of future periods passed to ``make_future_dataframe``.
    holidays : pandas.DataFrame, optional
        Forwarded to ``Prophet(holidays=...)``. ``None`` (the default) fits
        the model without holiday effects.

    Returns
    -------
    tuple
        ``(history, forecast)``: the store's rows renamed to 'ds'/'y' and
        sorted by 'ds', and the frame returned by ``model.predict``.

    Raises
    ------
    ValueError
        If ``sales_df`` has no rows for ``store_id``.
    """
    store_mask = sales_df['Store'] == store_id
    if not store_mask.any():
        # Fail early with a clear message instead of an error from inside the fit.
        raise ValueError('No rows found for store_id={!r}'.format(store_id))

    history = (
        sales_df.loc[store_mask, ['Date', 'Sales']]
        .rename(columns={'Date': 'ds', 'Sales': 'y'})
        .sort_values(by='ds')
    )

    model = Prophet(holidays=holidays)
    model.fit(history)
    future = model.make_future_dataframe(periods=periods)
    forecast = model.predict(future)

    # Called for their plotting side effect; the returned figures are not kept.
    model.plot(forecast, xlabel='Date', ylabel='Sales')
    model.plot_components(forecast)

    return history, forecast
    

In [None]:
df_original, df_prediction = sales_predict(10, sales_train_all, 60)

In [None]:
df_original.shape, df_prediction.shape

In [None]:
df_prediction.tail(60) #Sales prediction for the next 60 days.

### TRAINING MODEL 2 -> ACCOUNTING FOR HOLIDAYS

In [None]:
def sales_predict(store_id, sales_df, holidays, periods):
    """Fit a holiday-aware Prophet model on one store and forecast ahead.

    NOTE: this definition shadows the earlier ``sales_predict`` in this
    notebook, whose third positional argument is ``periods``. Once this cell
    has run, calls written for the earlier argument order will fail.

    Parameters
    ----------
    store_id
        Value matched against the 'Store' column to select the store's rows.
    sales_df : pandas.DataFrame
        Must contain the columns 'Store', 'Date' and 'Sales'.
    holidays : pandas.DataFrame
        Forwarded to ``Prophet(holidays=...)``; the callers in this notebook
        build it with the columns 'ds' and 'holiday'.
    periods : int
        Number of future periods passed to ``make_future_dataframe``.

    Returns
    -------
    tuple
        ``(history, forecast)``: the store's rows renamed to 'ds'/'y' and
        sorted by 'ds', and the frame returned by ``model.predict``.

    Raises
    ------
    ValueError
        If ``sales_df`` has no rows for ``store_id``.
    """
    store_mask = sales_df['Store'] == store_id
    if not store_mask.any():
        # Fail early with a clear message instead of an error from inside the fit.
        raise ValueError('No rows found for store_id={!r}'.format(store_id))

    history = (
        sales_df.loc[store_mask, ['Date', 'Sales']]
        .rename(columns={'Date': 'ds', 'Sales': 'y'})
        .sort_values(by='ds')
    )

    model = Prophet(holidays=holidays)
    model.fit(history)
    future = model.make_future_dataframe(periods=periods)
    forecast = model.predict(future)

    # Called for their plotting side effect; the returned figures are not kept.
    model.plot(forecast, xlabel='Date', ylabel='Sales')
    model.plot_components(forecast)

    return history, forecast
    

In [None]:
# Distinct dates flagged as a school holiday. The frame has one row per
# store/day, so without de-duplication each date would be repeated once per
# store and the holiday table would be needlessly large.
# NOTE(review): this pools the school holidays of all stores; confirm that is
# intended when forecasting a single store.
school_holidays = sales_train_all.loc[sales_train_all['SchoolHoliday'] == 1, 'Date'].unique()

In [None]:
# Distinct dates whose 'StateHoliday' code is 'a', 'b' or 'c'. De-duplicated
# because the frame has one row per store/day, so each date would otherwise be
# repeated once per store.
state_holidays = sales_train_all.loc[
    sales_train_all['StateHoliday'].isin(['a', 'b', 'c']), 'Date'
].unique()

In [None]:
# Turn the array of dates into a holiday table: parsed dates in 'ds' and a
# constant label in 'holiday'.
state_holidays = pd.DataFrame(
    {'ds': pd.to_datetime(state_holidays)}
).assign(holiday='state_holiday')

In [None]:
# Turn the array of dates into a holiday table: parsed dates in 'ds' and a
# constant label in 'holiday'.
school_holidays = pd.DataFrame(
    {'ds': pd.to_datetime(school_holidays)}
).assign(holiday='school_holiday')

In [None]:
# Stack both holiday tables (state rows first, then school rows) into one frame.
holidays_true = pd.concat([state_holidays, school_holidays], axis=0)

### Sales prediction for Store id 10 for the next 14 days.

In [None]:
df_original, df_predict = sales_predict(10, sales_train_all, holidays_true, 14)

In [None]:
# Take the forecast (df_predict), not the historical input (df_original): the
# last 14 rows of the forecast frame are the 14 future periods requested from
# sales_predict, and 'yhat' is Prophet's point prediction.
prediction_df = df_predict[['ds', 'yhat']].rename(columns = {'ds': 'Date', 'yhat':'Sales'}).tail(14)

In [None]:
#Final DF with the prediction
prediction_df