# Rossmann Sales prediction with Facebook Prophet

## The Problem
We have sales data from several stores and we want to predict sales in order to improve our stock logistics. 

## The solution 
Analyse the data and perform a sales prediction study using Facebook Prophet. 

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import datetime

In [None]:
# Load the training records. parse_dates=True would only try to parse the index, leaving
# 'Date' as plain strings, so the column is named explicitly to get real datetimes.
train_data = pd.read_csv('../input/rossmann-store-sales/train.csv', parse_dates=['Date'], low_memory=False)
train_data.head()

In [None]:
train_data.info()

Getting to know how the data is classified:

In [None]:
train_data.shape

In [None]:
train_data['DayOfWeek'].unique()


In [None]:
train_data['Open'].unique()

In [None]:
train_data['Promo'].unique()

In [None]:
train_data['StateHoliday'].unique()

StateHoliday codes:
- a = Public Holiday
- b = Easter
- c = Christmas

In [None]:
train_data['SchoolHoliday'].unique()

In [None]:
train_data.describe()

In [None]:
store_data = pd.read_csv('../input/rossmann-store-sales/store.csv')
store_data.head()

In [None]:
store_data.shape

In [None]:
store_data.info()

In [None]:
store_data.describe()

To simplify further analysis, let's rescale the CompetitionDistance variable by dividing it by 1000 (metres to kilometres):

In [None]:
# Rescale the competition distance to one thousandth of its stored value.
scaled_distance = store_data['CompetitionDistance'].div(1000)
store_data['CompetitionDistance'] = scaled_distance
store_data.head()

## Exploratory data analysis

### EDA for Train Dataset

Checking the sales train data for missing values, first by count and then with a heatmap 

In [None]:
train_data.isnull().sum()

In [None]:
plt.figure(figsize = (10,5))
sns.heatmap(train_data.isnull());

There are no null values that must be treated. 

Now we will graphically visualize the variables contained in the train_data dataset

In [None]:
train_data.hist(bins = 25, figsize=(18, 18), color = 'c');

The most important takeaways from these plots are:
- Most daily sales fall around 5 to 6 thousand euros
- The number of clients per day is most frequently around 600
- Stores were running a promotion (variable Promo) around 40% of the time they were open

Getting some important information quantitatively:

In [None]:
# Largest single-day customer count found in any row of the training data.
max_clients = train_data['Customers'].max()

print("The maximum number of clients per day was {}.".format(max_clients))

# Number of registers classified as 'opened store' or 'closed store'.
# Each subset is built exactly once, since every filter scans the whole dataframe.
ctrain_data = train_data[train_data['Open'] == 0]
otrain_data = train_data[train_data['Open'] == 1]

# Dividing the record counts by the number of rows in store_data gives a rounded
# per-store figure (assuming store_data holds one row per store -- TODO confirm).
n_closed = round(len(ctrain_data)/len(store_data),0)
n_open = round(len(otrain_data)/len(store_data),0)

print("The number of days with closed stores was {} and the number of days with open stores was {}".format(n_closed, n_open))



Now we will focus on the dataframe that contains only the records from days when the stores were open

In [None]:
train_data.head()

In [None]:
# Keep only the records from days on which the store was open (Open == 1), then drop the
# 'Open' flag, which is constant after the filter. Dropping the column alone would leave
# the closed-day rows (and their zero sales) in the data.
train_data = train_data[train_data['Open'] == 1].drop(columns=['Open'])
train_data

In [None]:
#statistics for the new dataframe
train_data.describe()

We can see from this new analysis that, once we removed the records for the days the stores were closed, the statistics changed. 

Before, with all the data points, the mean of the sales was around 5773; now it is 6955. The mean number of customers is another important variable that changed, going from 622 to 762. 

### EDA for Stores Dataset

In [None]:
store_data.isnull().sum()

In [None]:
plt.figure(figsize = (10,5))
sns.heatmap(store_data.isnull());

Checking the missing data:

In [None]:
store_data[store_data['CompetitionDistance'].isnull()]

In [None]:
store_data[store_data['CompetitionOpenSinceMonth'].isnull()]

In [None]:
store_data[store_data['CompetitionOpenSinceYear'].isnull()]

In [None]:
store_data[store_data['Promo2'] == 0]

#### Filling the missing numbers:
The missing values are in variables related to dates (weeks, months, years). These are discrete values rather than a continuous variable, so filling them with, for example, the mean would not be meaningful; we fill them with 0 instead. 

In [None]:
# Date-like / categorical columns whose missing entries are replaced by 0.
miss_cols = ['Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval',
             'CompetitionOpenSinceYear', 'CompetitionOpenSinceMonth']
# Assign the filled columns back in one step: this avoids a loop variable named `str`
# (which would shadow the builtin for the rest of the notebook) and avoids calling
# fillna(inplace=True) on a column selection, which may operate on a temporary copy.
store_data[miss_cols] = store_data[miss_cols].fillna(0)

In [None]:
plt.figure(figsize = (10,5))
sns.heatmap(store_data.isnull(), cbar = False);

Now we have no null data left, except in the variable CompetitionDistance, which we treat next. This is a continuous variable, as it represents the distance between the store and its competitors, so we can fill the missing values using the mean, as seen below: 

In [None]:
# Replace missing competition distances with the column mean. The result is assigned back
# explicitly because an in-place fillna on a column selection may act on a temporary copy.
store_data['CompetitionDistance'] = store_data['CompetitionDistance'].fillna(store_data['CompetitionDistance'].mean())

In [None]:
plt.figure(figsize = (10,5))
sns.heatmap(store_data.isnull(), cbar = False);

In [None]:
#### Now the dataset is free from missing data. 

Now we will visualize graphically the variables contained in the stores dataframe

In [None]:
store_data.hist(bins = 25, figsize=(18,18), color = 'm');

#### Main points from this data:
- Most of the stores' competitors are within 10 km distance
- Around half of the stores take part in the additional sales campaign


### Merging the dataframes Store and Train

We will use the store id ('Store' column) as a common variable between both dataframes

In [None]:
train_data.head()

In [None]:
store_data.head()

In [None]:
# Inner-join the sales records with the store attributes on the shared 'Store' id.
merged_data = train_data.merge(store_data, on='Store', how='inner')
merged_data.head()

In [None]:
merged_data.info()

In [None]:
merged_data.shape

Now let's visualize the correlations between variables

In [None]:
# Correlate the numeric columns only. Selecting them explicitly keeps the cell working on
# pandas versions where corr() no longer silently skips text columns.
correlations = merged_data.select_dtypes(include='number').corr()
f, ax = plt.subplots(figsize=(16,10))
sns.heatmap(correlations, annot = True);

#### Some conclusions from the correlation matrix: 
- The obvious correlations are: 
    - Sales are strongly correlated to the 'Customers' variable 
    - Sales are moderately correlated to the 'Promo' variable, meaning that promotions may have an impact on sales, but it is not that strong. 
- Some interesting correlations are: 
    - Additional promotions (variable 'Promo2') do not have a great impact on the volume of sales, as the correlation is -0.091. We might expect more promotions to be what is needed to increase sales, but that is not always true. 

In [None]:
# Another way to see the correlations, in this case those related to sales.
# Only numeric columns are correlated, so text columns cannot make corr() fail.
correlations = merged_data.select_dtypes(include='number').corr()['Sales'].sort_values()
correlations

In [None]:
# Correlations related to the number of customers.
# Only numeric columns are correlated, so text columns cannot make corr() fail.
correlations2 = merged_data.select_dtypes(include='number').corr()['Customers'].sort_values()
correlations2

### Visualising the data with time series

In [None]:
# Convert the 'Date' column to a DatetimeIndex once and reuse it for all three parts,
# instead of converting the full column separately for each one.
date_index = pd.DatetimeIndex(merged_data['Date'])
merged_data['Year'] = date_index.year
merged_data['Month'] = date_index.month
merged_data['Day'] = date_index.day
merged_data.head()

#### Monthly Analysis

In [None]:
# Mean sales for each calendar month, drawn as a single line.
monthly_sales = merged_data.groupby('Month')[['Sales']].mean()
axis = monthly_sales.plot(figsize=(16, 5), marker='o', color='g')
axis.set_title('Average sales per month')

In [None]:
# Mean customer count for each calendar month, drawn as a single line.
monthly_customers = merged_data.groupby('Month')[['Customers']].mean()
axis = monthly_customers.plot(figsize=(16, 5), marker='o', color='m')
axis.set_title('Average clients per month')

- We can clearly see the influence of Christmas on the sales, with a peak in the number of clients in December. 
- Another small peak appears around the month of July, during the summer vacations, when there are probably more people going to the shops. 
- Around January and February the number of clients is low, which may be explained by the fact that these are the winter months after Christmas. 

#### Daily Analysis

In [None]:
# Mean sales for each day of the month, drawn as a single line.
daily_sales = merged_data.groupby('Day')[['Sales']].mean()
axis = daily_sales.plot(figsize=(16, 5), marker='o', color='r')
axis.set_title('Average Sales per Day')

In [None]:
# Mean customer count for each day of the month, drawn as a single line.
daily_customers = merged_data.groupby('Day')[['Customers']].mean()
axis = daily_customers.plot(figsize=(16, 5), marker='o', color='c')
axis.set_title('Average clients per day')

#### Day of the Week analysis

In [None]:
# Mean sales for each value of 'DayOfWeek', drawn as a single line.
weekday_sales = merged_data.groupby('DayOfWeek')[['Sales']].mean()
axis = weekday_sales.plot(figsize=(16, 5), marker='o', color='r')
axis.set_title('Average sales per day of the week')

In [None]:
# Mean customer count for each value of 'DayOfWeek', drawn as a single line.
weekday_customers = merged_data.groupby('DayOfWeek')[['Customers']].mean()
axis = weekday_customers.plot(figsize=(16, 5), marker='o', color='k')
axis.set_title('Average customers per day of the week')

In [None]:
fig, ax = plt.subplots(figsize = (20,10))
# Select 'Sales' before aggregating: averaging every column first is wasted work and fails
# on pandas versions that refuse to take the mean of text columns.
# unstack() turns each StoreType into its own column, so every store type gets one line.
merged_data.groupby(['Date', 'StoreType'])['Sales'].mean().unstack().plot(ax = ax);

In [None]:
sns.barplot(x = 'Promo', y = 'Sales', data = merged_data);

In [None]:
sns.barplot(x = 'Promo', y = 'Customers', data = merged_data);

## Predicting sales with Facebook Prophet

In [None]:
!pip install fbprophet

In [None]:
# The library was distributed as 'fbprophet' before its 1.0 release and as 'prophet'
# afterwards; try the current package name first and fall back to the legacy one.
try:
    from prophet import Prophet
except ImportError:
    from fbprophet import Prophet

In [None]:
def sales_prediction(store_id, sales_data, periods):
    """Fit a default Prophet model to one store's sales and forecast ahead.

    Args:
        store_id: Value matched against the 'Store' column to pick the store.
        sales_data: DataFrame providing 'Store', 'Date' and 'Sales' columns.
        periods: Number of future periods passed to ``make_future_dataframe``.

    Returns:
        A ``(history, forecast)`` tuple: the selected store's rows with
        'Date'/'Sales' renamed to 'ds'/'y' and sorted by 'ds', and the
        frame returned by ``model.predict``.
    """
    is_store = sales_data['Store'] == store_id
    history = (
        sales_data.loc[is_store, ['Date', 'Sales']]
        .rename(columns={'Date': 'ds', 'Sales': 'y'})
        .sort_values(by='ds')
    )

    model = Prophet()
    model.fit(history)
    forecast = model.predict(model.make_future_dataframe(periods=periods))

    # Draw the forecast and its component breakdown; the figures are not returned.
    model.plot(forecast, xlabel='Date', ylabel='Sales')
    model.plot_components(forecast)

    return history, forecast

In [None]:
# Forecast 90 periods ahead for store 10.
df_origin, df_prediction = sales_prediction(store_id=10, sales_data=merged_data, periods=90)

In [None]:
df_origin.shape, df_prediction.shape

In [None]:
df_prediction.head()

In [None]:
df_prediction.tail(60)

In [None]:
df_prediction.tail(60).to_csv('sales_predictions.csv')

In [None]:
df_origin.tail()

In [None]:
def sales_prediction(store_id, sales_data, holidays, periods):
    """Fit a Prophet model with holiday effects to one store's sales and forecast ahead.

    This four-argument definition rebinds the name of the earlier three-argument
    ``sales_prediction`` in this notebook, so once this cell has run every call
    must pass ``holidays``.

    Args:
        store_id: Value matched against the 'Store' column to pick the store.
        sales_data: DataFrame providing 'Store', 'Date' and 'Sales' columns.
        holidays: DataFrame of holiday rows (here built with 'ds' and 'holiday'
            columns) passed to ``Prophet(holidays=...)``, or ``None``.
        periods: Number of future periods passed to ``make_future_dataframe``.

    Returns:
        A ``(history, forecast)`` tuple: the selected store's rows with
        'Date'/'Sales' renamed to 'ds'/'y' and sorted by 'ds', and the
        frame returned by ``model.predict``.
    """
    is_store = sales_data['Store'] == store_id
    history = (
        sales_data.loc[is_store, ['Date', 'Sales']]
        .rename(columns={'Date': 'ds', 'Sales': 'y'})
        .sort_values(by='ds')
    )

    # A holiday frame assembled from per-store rows (as done in this notebook) can repeat
    # the same row a very large number of times. Exact duplicates add no information, so
    # they are collapsed to keep the frame handed to Prophet small.
    # NOTE(review): presumes Prophet treats repeated identical holiday rows like a single
    # row -- confirm against the Prophet holiday documentation.
    if holidays is not None:
        holidays = holidays.drop_duplicates().reset_index(drop=True)

    model = Prophet(holidays=holidays)
    model.fit(history)
    forecast = model.predict(model.make_future_dataframe(periods=periods))

    # Draw the forecast and its component breakdown; the figures are not returned.
    model.plot(forecast, xlabel='Date', ylabel='Sales')
    model.plot_components(forecast)

    return history, forecast

In [None]:
merged_data.head()

In [None]:
# Dates of every row flagged as a school holiday (one entry per matching row).
on_school_holiday = merged_data['SchoolHoliday'] == 1
school_holidays = merged_data.loc[on_school_holiday, 'Date'].values
school_holidays.shape

#### *On which dates are there school holidays?*

In [None]:
school_holidays

In [None]:
len(np.unique(school_holidays))

In [None]:
# A row counts as a state holiday when its code is one of 'a', 'b' or 'c'.
is_state_holiday = merged_data['StateHoliday'].isin(['a', 'b', 'c'])
state_holidays = merged_data.loc[is_state_holiday, 'Date'].values

In [None]:
state_holidays.shape

In [None]:
len(np.unique(state_holidays))

In [None]:
# Wrap the state-holiday dates in a 'ds'/'holiday' frame, labelling every row 'state_holiday'.
state_holidays = pd.DataFrame({'ds': pd.to_datetime(state_holidays)}).assign(holiday='state_holiday')
state_holidays

In [None]:
# Wrap the school-holiday dates in a 'ds'/'holiday' frame, labelling every row 'school_holiday'.
school_holidays = pd.DataFrame({'ds': pd.to_datetime(school_holidays)}).assign(holiday='school_holiday')
school_holidays

In [None]:
# Stack both holiday frames row-wise, state holidays first; original row labels are kept.
holiday_frames = [state_holidays, school_holidays]
school_state_holidays = pd.concat(holiday_frames)
school_state_holidays

In [None]:
# Forecast 5 periods ahead for store 10, this time with the combined holiday frame.
df_original, df_prediction = sales_prediction(
    store_id=10,
    sales_data=merged_data,
    holidays=school_state_holidays,
    periods=5,
)

In [None]:
df_prediction.head()