In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import datetime


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the sales training data.
# StateHoliday is read as text so the whole column has a single dtype; the codes it is
# compared against later in this notebook are the strings 'a', 'b' and 'c'.
sales_train_df = pd.read_csv(
    '../input/rossmann-store-sales/train.csv',
    dtype={'StateHoliday': str},
)

In [None]:
# Preview the first five rows of the training data
sales_train_df.head(5)

In [None]:
# Print a concise summary of the training frame (columns, dtypes, non-null counts)
sales_train_df.info()

In [None]:
# Summary statistics for the numeric columns of the training data
sales_train_df.describe()

In [None]:
# Read the store-level information file and preview its first rows
store_csv_path = '../input/rossmann-store-sales/store.csv'
store_info_df = pd.read_csv(store_csv_path)
store_info_df.head()

In [None]:
# Summary statistics for the numeric columns of the store information
store_info_df.describe()


In [None]:
# Visualise missing values in the sales data: the mask is True wherever a value is null
sales_null_mask = sales_train_df.isnull()
sns.heatmap(sales_null_mask, cmap='Blues', cbar=False, yticklabels=False);

In [None]:
# Histogram of every numeric column of the sales data.
# The trailing ';' hides the array-of-Axes repr, matching the other plotting cells here.
sales_train_df.hist(bins=30, figsize=(20, 20), color='r');

In [None]:
# Largest value found in the 'Customers' column
sales_train_df['Customers'].max()

In [None]:
# Partition the records by the 'Open' flag: 1 -> opened, 0 -> closed
open_flag = sales_train_df['Open']
opened = sales_train_df.loc[open_flag == 1]
closed = sales_train_df.loc[open_flag == 0]

In [None]:
# Row counts of the open and closed subsets, as a (opened, closed) pair
opened.shape[0], closed.shape[0]

In [None]:
# Keep only the rows where the store was open.
# The explicit copy makes the result an independent frame, so later in-place edits
# do not act on a slice of the original (no SettingWithCopyWarning).
sales_train_df = sales_train_df.loc[sales_train_df['Open'] == 1].copy()

In [None]:
# Let's drop the open column since it has no meaning now.
# Reassigning (instead of inplace=True on a filtered frame) avoids SettingWithCopyWarning,
# and errors='ignore' keeps the cell safe to re-run once the column is already gone.
sales_train_df = sales_train_df.drop(columns='Open', errors='ignore')

In [None]:
# Summary statistics again, now restricted to rows where the store was open
sales_train_df.describe()

In [None]:
# Missing-value map for the store information
store_null_mask = store_info_df.isnull()
sns.heatmap(store_null_mask, cmap='Blues', cbar=False, yticklabels=False);

In [None]:
# Stores with no recorded competition distance
missing_distance = store_info_df['CompetitionDistance'].isnull()
store_info_df[missing_distance]

In [None]:
# Stores whose competition-opening month is missing
missing_open_month = store_info_df['CompetitionOpenSinceMonth'].isnull()
store_info_df[missing_open_month]

In [None]:
# Stores where the Promo2 flag is 0
no_promo2 = store_info_df['Promo2'] == 0
store_info_df.loc[no_promo2]

In [None]:
# Replace missing values in the Promo2 / competition-opening columns with 0.
# The filled columns are assigned back to the frame: calling fillna(inplace=True) on a
# column selection is deprecated and, under pandas Copy-on-Write, only modifies a
# temporary object and leaves store_info_df unchanged.
store_cols = ['Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval', 'CompetitionOpenSinceYear', 'CompetitionOpenSinceMonth']
store_info_df[store_cols] = store_info_df[store_cols].fillna(0)

In [None]:
# Re-check the store information for remaining nulls
sns.heatmap(
    store_info_df.isnull(),
    cmap='Blues',
    yticklabels=False,
    cbar=False,
);

In [None]:
# Fill missing competition distances with the column mean.
# Assign the result back instead of using fillna(inplace=True) on the column selection,
# which is deprecated and has no effect on the frame under pandas Copy-on-Write.
mean_competition_distance = store_info_df['CompetitionDistance'].mean()
store_info_df['CompetitionDistance'] = store_info_df['CompetitionDistance'].fillna(mean_competition_distance)

In [None]:
# Null map of the store information after the competition distance has been filled
remaining_nulls = store_info_df.isnull()
sns.heatmap(remaining_nulls, yticklabels=False, cmap='Blues', cbar=False);

In [None]:
# Histogram of every numeric column of the store information.
# The trailing ';' hides the array-of-Axes repr, matching the other plotting cells here.
store_info_df.hist(bins=30, figsize=(20, 20), color='r');


In [None]:
# Combine the sales records with the store information: inner join on the shared 'Store' key
sales_train_all = sales_train_df.merge(store_info_df, on='Store', how='inner')

In [None]:
# Correlation of every numeric column with 'Sales', sorted ascending.
# Non-numeric columns (dates, holiday codes, store type, ...) are excluded explicitly:
# DataFrame.corr() no longer drops them silently in pandas >= 2.0 and raises instead.
numeric_columns = sales_train_all.select_dtypes(include=['number', 'bool'])
correlations = numeric_columns.corr()['Sales'].sort_values()
correlations

In [None]:
# Full correlation matrix of the numeric columns, drawn as an annotated heatmap.
# Restricting to numeric/bool columns first keeps corr() working on pandas >= 2.0,
# where string columns are no longer dropped automatically.
correlations = sales_train_all.select_dtypes(include=['number', 'bool']).corr()
plt.figure(figsize=(20,20));
sns.heatmap(correlations, annot=True);

In [None]:
# Parse the 'Date' column once and store its year component in a new 'Year' column
year_source = pd.DatetimeIndex(sales_train_all['Date'])
sales_train_all['Year'] = year_source.year

In [None]:
# Derive 'Month' and 'Day' columns from the same parsed dates
month_day_source = pd.DatetimeIndex(sales_train_all['Date'])
sales_train_all['Month'] = month_day_source.month
sales_train_all['Day'] = month_day_source.day

In [None]:
# Display the merged frame, which now carries the derived Year, Month and Day columns
sales_train_all

In [None]:
# Let's take a look at the average sales and number of customers per month
# 'groupby' works great by grouping all the data that share the same month column, then obtain the mean of the sales column
# It looks like sales and number of customers peak around christmas timeframe
# NOTE: no plt.figure() between the two plots -- DataFrame.plot opens its own figure,
# so an extra plt.figure() call only leaves an empty figure in the cell output.
axis = sales_train_all.groupby('Month')[['Sales']].mean().plot(figsize = (10,5), marker = 'o', color = 'r')
axis.set_title('Average Sales Per Month')

axis = sales_train_all.groupby('Month')[['Customers']].mean().plot(figsize = (10,5), marker = '^', color = 'b')
axis.set_title('Average Customers Per Month');

In [None]:
# Let's take a look at the sales and customers per day of the month instead
# Minimum number of customers are generally around the 24th of the month
# Most customers and sales are around 30th and 1st of the month

# The Axes returned by each plot is the one that gets its title set; the same name is
# used for both steps so the title never lands on a stale Axes from an earlier cell.
axis = sales_train_all.groupby('Day')[['Sales']].mean().plot(figsize = (10,5), marker = 'o', color = 'r')
axis.set_title('Average Sales Per Day')

# No plt.figure() here: DataFrame.plot opens its own figure, so the extra call would
# only leave an empty figure in the output.
axis = sales_train_all.groupby('Day')[['Customers']].mean().plot(figsize = (10,5), marker = '^', color = 'b')
axis.set_title('Average Customers Per Day');

In [None]:
# Let's do the same for the day of the week  (note that 7 = Sunday)
# NOTE: no plt.figure() between the two plots -- DataFrame.plot opens its own figure,
# so an extra plt.figure() call only leaves an empty figure in the cell output.

axis = sales_train_all.groupby('DayOfWeek')[['Sales']].mean().plot(figsize = (10,5), marker = 'o', color = 'r')
axis.set_title('Average Sales Per Day of the Week')

axis = sales_train_all.groupby('DayOfWeek')[['Customers']].mean().plot(figsize = (10,5), marker = '^', color = 'b')
axis.set_title('Average Customers Per Day of the Week');

In [None]:
# Mean sales per date for each store type, one line per StoreType.
# 'Sales' is selected before aggregating: averaging every column first is wasteful and
# raises on the string columns in pandas >= 2.0.
fig, ax = plt.subplots(figsize=(20,10))
sales_train_all.groupby(['Date','StoreType'])['Sales'].mean().unstack().plot(ax=ax);

In [None]:
# Compare the two Promo values: Sales in the top panel, Customers in the bottom panel
promo_fig, (sales_axis, customers_axis) = plt.subplots(2, 1, figsize=[15, 10])
sns.barplot(x='Promo', y='Sales', data=sales_train_all, ax=sales_axis)
sns.barplot(x='Promo', y='Customers', data=sales_train_all, ax=customers_axis)

In [None]:
# Distribution of Sales (top) and Customers (bottom) for each Promo value
plt.figure(figsize=[15, 10])

plt.subplot(2, 1, 1)
sns.violinplot(data=sales_train_all, x='Promo', y='Sales')

plt.subplot(2, 1, 2)
sns.violinplot(data=sales_train_all, x='Promo', y='Customers')

In [None]:
# import prophet 
!pip install fbprophet
from fbprophet import Prophet

In [None]:
def sales_prediction(Store_ID, sales_df, periods):
    """Fit a Prophet model on one store's sales history and plot its forecast.

    Args:
        Store_ID: value matched against the 'Store' column to select the rows.
        sales_df: frame providing at least the 'Store', 'Date' and 'Sales' columns.
        periods: number of future periods handed to ``make_future_dataframe``.

    Returns:
        None. The forecast and its components are drawn as figures.
    """
    # Prophet expects the columns 'ds' (date) and 'y' (value), ordered by date
    store_rows = sales_df.loc[sales_df['Store'] == Store_ID, ['Date', 'Sales']]
    history = store_rows.rename(columns={'Date': 'ds', 'Sales': 'y'}).sort_values('ds')

    prophet_model = Prophet()
    prophet_model.fit(history)

    horizon = prophet_model.make_future_dataframe(periods=periods)
    prediction = prophet_model.predict(horizon)

    prophet_model.plot(prediction, xlabel='Date', ylabel='Sales')
    prophet_model.plot_components(prediction)

In [None]:
# Forecast 60 periods ahead for store 10
sales_prediction(Store_ID=10, sales_df=sales_train_all, periods=60)

In [None]:
#Including Holidays
def sales_prediction(Store_ID, sales_df, holidays, periods):
    """Fit a holiday-aware Prophet model on one store's sales and plot its forecast.

    NOTE: this definition replaces the earlier three-argument ``sales_prediction``
    defined above in the notebook; once this cell has run, the three-argument
    call form no longer works.

    Args:
        Store_ID: value matched against the 'Store' column to select the rows.
        sales_df: frame providing at least the 'Store', 'Date' and 'Sales' columns.
        holidays: holiday table passed straight to ``Prophet(holidays=...)``.
        periods: number of future periods handed to ``make_future_dataframe``.

    Returns:
        None. The forecast and its components are drawn as figures.
    """
    # Prophet expects the columns 'ds' (date) and 'y' (value), ordered by date
    store_rows = sales_df.loc[sales_df['Store'] == Store_ID, ['Date', 'Sales']]
    history = store_rows.rename(columns={'Date': 'ds', 'Sales': 'y'}).sort_values('ds')

    prophet_model = Prophet(holidays=holidays)
    prophet_model.fit(history)

    horizon = prophet_model.make_future_dataframe(periods=periods)
    prediction = prophet_model.predict(horizon)

    prophet_model.plot(prediction, xlabel='Date', ylabel='Sales')
    prophet_model.plot_components(prediction)

In [None]:
# Get all the dates pertaining to school holidays.
# The merged frame holds one row per store per date, so the same date shows up many
# times; unique() keeps each date once and keeps the holiday table small.
# NOTE(review): dates are pooled across all stores rather than per store -- confirm
# that is intended for the single-store forecast.
school_holidays = sales_train_all.loc[sales_train_all['SchoolHoliday'] == 1, 'Date'].unique()
school_holidays.shape

In [None]:
# Get all the dates pertaining to state holidays (StateHoliday codes 'a', 'b' or 'c').
# unique() keeps each date once: the merged frame repeats a date for every store.
is_state_holiday = sales_train_all['StateHoliday'].isin(['a', 'b', 'c'])
state_holidays = sales_train_all.loc[is_state_holiday, 'Date'].unique()
state_holidays.shape

In [None]:
# Build the state-holiday table: parsed dates in 'ds', a constant label in 'holiday'
state_holiday_dates = pd.to_datetime(state_holidays)
state_holidays = pd.DataFrame({'ds': state_holiday_dates}).assign(holiday='state_holiday')

In [None]:
# Inspect the state-holiday table (columns 'ds' and 'holiday')
state_holidays

In [None]:
# Build the school-holiday table: parsed dates in 'ds', a constant label in 'holiday'
school_holiday_dates = pd.to_datetime(school_holidays)
school_holidays = pd.DataFrame({'ds': school_holiday_dates}).assign(holiday='school_holiday')

In [None]:
# Stack the school and state holiday tables row-wise (original row labels are kept)
holiday_tables = [school_holidays, state_holidays]
school_state_holidays = pd.concat(holiday_tables, axis=0)

In [None]:
# Inspect the combined school + state holiday table
school_state_holidays

In [None]:
# Forecast 60 periods ahead for store 6, using the combined holiday table
sales_prediction(
    Store_ID=6,
    sales_df=sales_train_all,
    holidays=school_state_holidays,
    periods=60,
)