In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the competition input directory.
for folder, _subfolders, names in os.walk('/kaggle/input/favorita-grocery-sales-forecasting'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install py7zr

In [None]:
import py7zr
from subprocess import check_output

# Unpack every 7z archive of the competition into the writable working directory.
for dirname, _, filenames in os.walk('/kaggle/input/favorita-grocery-sales-forecasting'):
    for filename in filenames:
        # Anything that is not a 7z archive is skipped instead of crashing the loop.
        if not filename.endswith('.7z'):
            continue
        # The context manager closes the archive even when extraction raises.
        with py7zr.SevenZipFile(os.path.join(dirname, filename), mode='r') as archive:
            archive.extractall(path="/kaggle/working")

# List the extracted files as a quick sanity check.
print(check_output(["ls", "../working"]).decode("utf8"))

# EDA

In [None]:
# Importing the relevant libraries
import IPython.display
import json
import pandas as pd
import seaborn as sns
import squarify
%matplotlib inline
import missingno as msno
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import numpy as np
from matplotlib import pyplot as plt
color = sns.color_palette()

# D3 modules
from IPython.core.display import display, HTML, Javascript
from string import Template

## Explore and prepare the data

In [None]:
# Side tables: small enough to be read completely.
items = pd.read_csv("../working/items.csv")
stores = pd.read_csv("../working/stores.csv")
holiday_events = pd.read_csv(
    "../working/holidays_events.csv",
    parse_dates=['date'],
)
oil = pd.read_csv(
    "../working/oil.csv",
    parse_dates=['date'],
)
transactions = pd.read_csv(
    "../working/transactions.csv",
    parse_dates=['date'],
)

# The full training file is reported as "125,497,040 rows | 6 columns", so only
# slices of it are read:
#   train       - the first 6,000,000 data rows (roughly 5% of the file)
#   train_large - everything after the first 115,000,000 lines; the header line is
#                 among the skipped ones, hence the column names are passed explicitly
train = pd.read_csv(
    "../working/train.csv",
    nrows=6000000,
    parse_dates=['date'],
)
train_large = pd.read_csv(
    '../working/train.csv',
    skiprows=115000000,
    names=train.columns,
    parse_dates=['date'],
)


In [None]:
# Preview the first five rows of the training sample.
train.head(n=5)

In [None]:
# For each side table, print its column names next to a per-column "has nulls" flag.
null_reports = [
    ("Nulls in Oil columns: {0} => {1}", oil),
    ("Nulls in holiday_events columns: {0} => {1}", holiday_events),
    ("Nulls in stores columns: {0} => {1}", stores),
    ("Nulls in transactions columns: {0} => {1}", transactions),
]
for position, (template, frame) in enumerate(null_reports):
    if position > 0:
        # Separator line between two consecutive reports.
        print("="*70)
    print(template.format(frame.columns.values, frame.isnull().any().values))

 The only missing data occurs in the oil data file, which provides the historical daily price for oil.

## Oil Data

In [None]:
# Preview the first three rows of the oil price table.
oil.head(n=3)

In [None]:
 # Drop the days without a price from BOTH axes. Dropping them from `y` only leaves
 # `x` longer than `y`, so every price after the first gap is drawn at the wrong date.
 oil_priced = oil.dropna(subset=['dcoilwtico'])

 trace = go.Scatter(
     name='Oil prices',
     x=oil_priced['date'],
     y=oil_priced['dcoilwtico'],
     mode='lines',
     # A color with an alpha channel needs the rgba() form; rgb() takes three components.
     line=dict(color='rgba(20, 15, 200, 0.8)'),
     fillcolor='rgba(0, 0, 216, 0.3)',
     fill='tonexty' )

 data = [trace]

 layout = go.Layout(
     yaxis=dict(title='Daily Oil price'),
     title='Daily oil prices from Jan 2013 till July 2017',
     showlegend = False)
 fig = go.Figure(data=data, layout=layout)
 py.iplot(fig, filename='pandas-time-series-error-bars')

From January 2013 to July 2017, this graph demonstrates that the daily oil price has been on a declining trend. Whereas the price of oil began 2013 by rising and even breaking the $100 barrier for a few months in 2013, the price of oil began to plummet in the middle of 2014, resulting in a significant decline in the price of oil. This trend appears to be true based on some quick open-source research (i.e. Googling), as oil prices were relatively stable from 2010 to mid-2014, after which they drastically fell (due to a confluence of factors including weak demand due to poor economic growth and surging alternative crude oil sources such as shale/tar sands).

## Stores Data

In [None]:
# Preview the first three rows of the store metadata.
stores.head(n=3)

In [None]:
# First and last day on which each store has a transactions record. The first day is
# used as a proxy for the opening date, so a store that was already trading before
# its first record is attributed to the year of that first record.
# Named aggregation gives stable column names; the labels produced by passing
# np.min / np.max ('amin'/'amax' vs 'min'/'max') depend on the pandas version.
temp = (
    transactions.groupby('store_nbr')['date']
    .agg(first_date='min', last_date='max')
    .reset_index()
)
temp['store_age'] = temp['last_date'] - temp['first_date']
temp['open_year'] = temp['first_date'].dt.year
data = temp['open_year'].value_counts()
plt.figure(figsize=(12,4))
# seaborn >= 0.12 only accepts the data vectors as keyword arguments.
sns.barplot(x=data.index, y=data.values, alpha=0.8, color=color[0])
plt.ylabel('Stores', fontsize=12)
plt.xlabel('Store opening Year', fontsize=12)
plt.title('When were the stores started?', fontsize=15)
plt.xticks(rotation='vertical')
plt.show()

5 Stores were opened in 2015 and 1 each in 2014 and 2017

In [None]:
# Treemap with one rectangle per city, sized by the number of stores in that city.
fig = plt.figure(figsize=(25, 21))
# Name the count column explicitly: pandas 2 calls it 'count' instead of reusing the
# source column name, which would break the lookup by 'city' below.
marrimeko=stores.city.value_counts().to_frame(name='city')
ax = fig.add_subplot(111, aspect="equal")
ax = squarify.plot(sizes=marrimeko['city'].values,label=marrimeko.index,
              color=sns.color_palette('cubehelix_r', 28), alpha=1)
# The axes carry no meaning in a treemap, so the ticks are hidden.
ax.set_xticks([])
ax.set_yticks([])
fig=plt.gcf()
fig.set_size_inches(40,25)
plt.title("Treemap of store counts across different cities", fontsize=18)
plt.show();

In [None]:
# Treemap with one rectangle per state, sized by the number of stores in that state.
fig = plt.figure(figsize=(25, 21))
# Name the count column explicitly: pandas 2 calls it 'count' instead of reusing the
# source column name, which would break the lookup by 'state' below.
marrimeko=stores.state.value_counts().to_frame(name='state')
ax = fig.add_subplot(111, aspect="equal")
ax = squarify.plot(sizes=marrimeko['state'].values,label=marrimeko.index,
              color=sns.color_palette('viridis_r', 28), alpha=1)
# The axes carry no meaning in a treemap, so the ticks are hidden.
ax.set_xticks([])
ax.set_yticks([])
fig=plt.gcf()
fig.set_size_inches(40,25)
plt.title("Treemap of store counts across different States", fontsize=18)
plt.show()

In [None]:
# Distinct state names that occur in the store metadata.
stores['state'].unique()

Our store numbers will now be arranged against their corresponding shop clusters, allowing us to see whether there are any obvious trends or links in the data. To do so, I'll use the groupby and pivot statements to group our stores Python dataframe based on the values "store nbr" and "cluster." Then I'll unstack the grouping by pivoting on the level of store nbr index labels, yielding a DataFrame with a new level of columns that are the store clusters whose inner-most level pertain to the pivoted store nbr index labels. This method is widely used in Python to create stacked barplots, but because we just have unique store nbr numbers, we'll just receive barplots of store numbers arranged by their relevant clusters.

In [None]:
# Row positions (0-53, each exactly once) used with .iloc to reorder the per-store
# rows of the cluster bar chart.
# NOTE(review): presumably this ordering groups stores by cluster - confirm against `stores`.
neworder = [
    23, 24, 26, 36, 41, 15, 29, 31, 32,
    34, 39, 53, 4, 37, 40, 43, 8, 10,
    19, 20, 33, 38, 13, 21, 2, 6, 7,
    3, 22, 25, 27, 28, 30, 35, 42, 44,
    48, 51, 16, 0, 1, 5, 52, 45, 46,
    47, 49, 9, 11, 12, 14, 18, 17, 50,
]

In [None]:
# Finally plot the seaborn heatmap
plt.style.use('dark_background')
plt.figure(figsize=(15,12))
# One row per store and one column per cluster; the only filled cell of a row holds
# the store number itself, so the annotation shows which cluster the store is in.
# pivot() takes its arguments by keyword only in pandas 2.
store_pivot = stores.dropna().pivot(index="store_nbr", columns="cluster", values="store_nbr")
ax = sns.heatmap(store_pivot, cmap='jet', annot=True, linewidths=0, linecolor='white')
plt.title('Store numbers and the clusters they are assigned to')
plt.show()

In [None]:
 # Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and later
 # releases dropped the old names, so use the new name whenever it is available.
 white_style = 'seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white'
 plt.style.use(white_style)
 # One bar per store (rows reordered by the positions in `neworder`), colored by cluster.
 nbr_cluster = stores.groupby(['store_nbr','cluster']).size()
 nbr_cluster.unstack().iloc[neworder].plot(kind='bar',stacked=True, colormap= 'tab20', figsize=(13,11),  grid=False)
 plt.title('Store numbers and the clusters they are assigned to', fontsize=14)
 plt.ylabel('')
 plt.xlabel('Store number')
 plt.show()

From visualising the store numbers side-by-side based on the clustering, we can identify certain patterns. 
For example, clusters 3, 6, 10 and 15 are the most common store clusters, since more store_nbrs are attributed to them than to the others,
while on the other end of the spectrum, we have clusters 5 and 17 which are only related to the stores 44 and 51 respectively.

We can also look at the distribution of clusters by store type, to see whether there is any apparent relationship between a store's type and the way the company has decided to cluster it.

In [None]:
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and later
# releases dropped the old names, so use the new name whenever it is available.
white_style = 'seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white'
plt.style.use(white_style)
# Number of stores per (type, cluster) pair, stacked into one bar per store type.
type_cluster = stores.groupby(['type','cluster']).size()
type_cluster.unstack().plot(kind='bar',stacked=True, colormap= 'PuBu', figsize=(13,11),  grid=False)
plt.title('Stacked Barplot of Store types and their cluster distribution', fontsize=18)
plt.ylabel('Count of clusters in a particular store type', fontsize=16)
plt.xlabel('Store type', fontsize=16)
plt.show()

In [None]:
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and later
# releases dropped the old names, so use the new name whenever it is available.
white_style = 'seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white'
plt.style.use(white_style)
# Number of stores per (city, type) pair, stacked into one bar per city.
city_cluster = stores.groupby(['city','type']).store_nbr.size()
city_cluster.unstack().plot(kind='bar',stacked=True, colormap= 'viridis', figsize=(13,11),  grid=False)
plt.title('Stacked Barplot of Store types opened for each city')
plt.ylabel('Count of stores for a particular city')
plt.show()

Guayaquil and Quito are two cities that stand out in terms of the range of retail kinds available. These are unsurprising given that Quito is Ecuador's capital and Guayaquil is the country's largest and most populated metropolis. As a result, one might expect Corporacion Favorita to target these major cities with the most diverse store types, as evidenced by the highest counts of store nbrs attributed to those two cities.

## Holiday Events Data

In [None]:
# Preview the first three rows of the holiday / event calendar.
holiday_events.head(n=3)

In [None]:
#does the transactions peak at holiday events?
# Rolling statistics only make sense on a single series ordered in time. Rolling over
# the raw frame mixes the rows of all stores in file order and also drags the date and
# store_nbr columns into the computation, so the records are first summed per day.
daily_transactions = transactions.groupby('date')['transactions'].sum()
rolling_30 = daily_transactions.rolling(window=30,center=False)
plt.figure(figsize=(12,12))
plt.plot(rolling_30.mean(),label='Rolling Mean');
plt.plot(rolling_30.std(),label='Rolling sd');
plt.legend();

In [None]:
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and later
# releases dropped the old names, so use the new name whenever it is available.
white_style = 'seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white'
plt.style.use(white_style)
# Number of calendar entries per (locale_name, type) pair, stacked into one bar per locale.
holiday_local_type = holiday_events.groupby(['locale_name', 'type']).size()
holiday_local_type.unstack().plot(kind='bar',stacked=True, colormap= 'magma_r', figsize=(12,10),  grid=False)
plt.title('Stacked Barplot of locale name against event type')
plt.ylabel('Count of entries')
plt.show()

### D3.js visualization library

In [None]:
 # Frequency of every distinct value of the `type` and `description` columns,
 # printed without row truncation.
 type_and_description = holiday_events[['type','description']]
 with pd.option_context('display.max_rows', None, 'display.max_columns', 3):
     print(type_and_description.apply(pd.Series.value_counts))

In [None]:
# Prepping the json file
# Build the "flare" hierarchy (root -> holiday type -> description) straight from the
# data instead of typing the counts by hand: the hand-typed version contained a
# misspelled description ("ecupero puente primer dia del ano") and an entry of size
# 0.0 ("Carnaval"), and it silently goes stale whenever the input changes.
holiday_json = {
    "name": "flare",
    "children": [
        {
            "name": holiday_type,
            "children": [
                # float() keeps the sizes as floats and turns the numpy integer into a
                # plain Python number, which json.dump can serialize.
                {"name": description, "size": float(count)}
                for description, count in events["description"].value_counts().items()
            ],
        }
        for holiday_type, events in holiday_events.groupby("type", sort=False)
    ],
}

In [None]:
# Serialize the holiday hierarchy to output.json, the file requested by the D3 script.
with open('output.json', 'w') as json_file:
    json_file.write(json.dumps(holiday_json))
# Read the file back as a check that it parses; the preview itself is not rendered
# because it is not the last expression of the cell.
pd.read_json('output.json').head()

# Stylesheet plus the empty <svg> element that the D3 script draws into.
html_string = """
<!DOCTYPE html>
<meta charset="utf-8">
<style>

.node {
  cursor: pointer;
}

.node:hover {
  stroke: #000;
  stroke-width: 1.5px;
}

.node--leaf {
  fill: white;
}

.label {
  font: 11px "Helvetica Neue", Helvetica, Arial, sans-serif;
  text-anchor: middle;
  text-shadow: 0 1px 0 #fff, 1px 0 0 #fff, -1px 0 0 #fff, 0 -1px 0 #fff;
}

.label,
.node--root,
.node--leaf {
  pointer-events: none;
}

</style>
<svg width="760" height="760"></svg>
"""

In [None]:
# Finally embed the D3.js to produce the circular treemap
# The script below is executed in the browser, not by the kernel. It
#   * loads D3 v4 from d3js.org through require.js,
#   * selects the first <svg> element of the page and reads its width as the diameter,
#   * fetches "output.json" and packs its hierarchy into nested circles sized by `size`,
#   * zooms into a circle when it is clicked and back to the root when the background
#     is clicked (holding Alt slows the transition down).
js_string="""
 require.config({
    paths: {
        d3: "https://d3js.org/d3.v4.min"
     }
 });

  require(["d3"], function(d3) {

   console.log(d3);

var svg = d3.select("svg"),
    margin = 20,
    diameter = +svg.attr("width"),
    g = svg.append("g").attr("transform", "translate(" + diameter / 2 + "," + diameter / 2 + ")");

var color = d3.scaleSequential(d3.interpolateViridis)
    .domain([-4, 4]);

var pack = d3.pack()
    .size([diameter - margin, diameter - margin])
    .padding(2);

d3.json("output.json", function(error, root) {
  if (error) throw error;

  root = d3.hierarchy(root)
      .sum(function(d) { return d.size; })
      .sort(function(a, b) { return b.value - a.value; });

  var focus = root,
      nodes = pack(root).descendants(),
      view;

  var circle = g.selectAll("circle")
    .data(nodes)
    .enter().append("circle")
      .attr("class", function(d) { return d.parent ? d.children ? "node" : "node node--leaf" : "node node--root"; })
      .style("fill", function(d) { return d.children ? color(d.depth) : null; })
      .on("click", function(d) { if (focus !== d) zoom(d), d3.event.stopPropagation(); });

  var text = g.selectAll("text")
    .data(nodes)
    .enter().append("text")
      .attr("class", "label")
      .style("fill-opacity", function(d) { return d.parent === root ? 1 : 0; })
      .style("display", function(d) { return d.parent === root ? "inline" : "none"; })
      .text(function(d) { return d.data.name; });

  var node = g.selectAll("circle,text");

  svg
      .style("background", color(-1))
      .on("click", function() { zoom(root); });

  zoomTo([root.x, root.y, root.r * 2 + margin]);

  function zoom(d) {
    var focus0 = focus; focus = d;

    var transition = d3.transition()
        .duration(d3.event.altKey ? 7500 : 750)
        .tween("zoom", function(d) {
          var i = d3.interpolateZoom(view, [focus.x, focus.y, focus.r * 2 + margin]);
          return function(t) { zoomTo(i(t)); };
        });

    transition.selectAll("text")
      .filter(function(d) { return d.parent === focus || this.style.display === "inline"; })
        .style("fill-opacity", function(d) { return d.parent === focus ? 1 : 0; })
        .on("start", function(d) { if (d.parent === focus) this.style.display = "inline"; })
        .on("end", function(d) { if (d.parent !== focus) this.style.display = "none"; });
  }

  function zoomTo(v) {
    var k = diameter / v[2]; view = v;
    node.attr("transform", function(d) { return "translate(" + (d.x - v[0]) * k + "," + (d.y - v[1]) * k + ")"; });
    circle.attr("r", function(d) { return d.r * k; });
  }
});
  });
 """

In [None]:
# Emit the stylesheet and the <svg> container first ...
svg_markup = HTML(html_string)
h = display(svg_markup)
# ... then hand the D3 script to the browser so it can draw into that container.
j = Javascript(js_string)
IPython.display.display_javascript(j)

In [None]:
# Distinct entry types that occur in the holiday / event calendar.
holiday_events['type'].unique()

## Transactions data

In [None]:
# Preview the first three rows of the transactions table.
transactions.head(n=3)

In [None]:
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and later
# releases dropped the old names, so use the new name whenever it is available.
white_style = 'seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white'
plt.style.use(white_style)
plt.figure(figsize=(13,11))
# Every transactions record (all stores) drawn against its date, in file order.
plt.plot(transactions.date.values, transactions.transactions.values, color='darkblue')
plt.ylim(-50, 10000)
plt.title("Distribution of transactions per day from 2013 till 2017")
plt.ylabel('transactions per day', fontsize= 16)
plt.xlabel('Date', fontsize= 16)
plt.show()

The bigger yearly periodic spike in transactions seems to occur at the end of the year, in December. Perhaps this is due to some sort of Christmas sale/discount that Corporacion Favorita holds every December.

In [None]:
#transactions
# Number of stores with a transactions record on each day, one line per calendar year.
transactions['date']=pd.to_datetime(transactions['date'])
# After reset_index the columns are: date, store_nbr (= number of stores that
# reported on that day) and transactions (= total over all stores).
temp=transactions.groupby(['date']).aggregate({'store_nbr':'count','transactions':'sum'})
temp=temp.reset_index()
years = range(2013, 2018)
temp_2013, temp_2014, temp_2015, temp_2016, temp_2017 = (
    temp[temp['date'].dt.year==year].reset_index(drop=True) for year in years
)
stores_open_by_year = dict(zip(years, (temp_2013, temp_2014, temp_2015, temp_2016, temp_2017)))

sns.set(style="whitegrid", color_codes=True)
plt.figure(figsize=(15,14))

# Top panel: against the calendar date.
plt.subplot(211)
for year, frame in stores_open_by_year.items():
    plt.plot(frame['date'], frame['store_nbr'], label=str(year))
plt.ylabel('Number of stores open', fontsize=12)
plt.xlabel('Time', fontsize=12)
plt.title('Number of stores open', fontsize=15)
plt.xticks(rotation='vertical')
# The legend is built from the line labels, so all five years are listed
# (a hard-coded list of four names dropped 2017).
plt.legend(loc='lower right')

# Bottom panel: the years overlaid, against the position of the day within its year.
plt.subplot(212)
for year, frame in stores_open_by_year.items():
    plt.plot(frame.index, frame['store_nbr'], label=str(year))
plt.ylabel('Number of stores open', fontsize=12)
plt.xlabel('Day of year', fontsize=12)
plt.title('Number of stores open', fontsize=15)
plt.xticks(rotation='vertical')
plt.legend(loc='lower right')
plt.show()

There seems to be certain local holidays where some of the stores are closed. But there is no consistent pattern of holidays where stores are closed

### Store #47

The chain established itself in Quito in 1952 (according to Wikipedia), so let's pick a shop in Quito as a starting point, as the brand is well established there. Let us pick #47 and plot the corresponding transactions time series. With a well-established store, we can expect the time series to be almost stationary. High seasonality is expected too, as people consume more during celebration periods.

In [None]:
# Daily transaction counts of store #47 as a float Series indexed by date.
ts = (
    transactions.loc[transactions['store_nbr']==47, ['date','transactions']]
    .set_index('date')['transactions']
    .astype('float')
)

plt.figure(figsize=(12,12))
plt.plot(ts);
plt.title('Daily transactions in store #47')
plt.xlabel('time')
plt.ylabel('Number of transactions')

## Items data

In [None]:
# Preview the first five rows of the item metadata.
items.head(n=5)

In [None]:
# Number of items in each family. value_counts() is evaluated once and reused; the
# alphabetically sorted x / y lists that used to be built here were never read.
family_counts = items.family.value_counts()
trace2 = go.Bar(
    y=family_counts.values,
    x=family_counts.index,
    marker=dict(
        # Bars are colored by their own height.
        color=family_counts.values,
        colorscale = 'Portland',
        reversescale = False
    ),
    orientation='v',
)

layout = dict(
    title='Counts of items per family category',
     width = 800, height = 800,
    yaxis=dict(
        showgrid=False,
        showline=False,
        showticklabels=True,
    ))

fig1 = go.Figure(data=[trace2])
fig1['layout'].update(layout)
py.iplot(fig1, filename='plots')

As we can see from the plot, the top 3 family categories are the GROCERY I, BEVERAGES and CLEANING categories.

In [None]:
# Number of items in each item class. value_counts() is evaluated once and reused; the
# sorted x / y lists that used to be built here were never read.
class_counts = items['class'].value_counts()
trace2 = go.Bar(
    x=class_counts.index,
    y=class_counts.values,
    marker=dict(
        # Bars are colored by their own height.
        color=class_counts.values,
        colorscale = 'Jet',
        reversescale = True
    ),
    orientation='v',
)

layout = dict(
    title='Number of items attributed to a particular item class',
     width = 800, height = 1400,
    yaxis=dict(
        showgrid=False,
        showline=False,
        showticklabels=True
    ))

fig1 = go.Figure(data=[trace2])
fig1['layout'].update(layout)
py.iplot(fig1, filename='plots')

In [None]:
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and later
# releases dropped the old names, so use the new name whenever it is available.
white_style = 'seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white'
plt.style.use(white_style)
# Number of items per (family, perishable) pair, stacked into one bar per family.
fam_perishable = items.groupby(['family', 'perishable']).size()
fam_perishable.unstack().plot(kind='bar',stacked=True, colormap= 'coolwarm', figsize=(12,10),  grid=False)
# The title used to be a leftover from the holiday plot.
plt.title('Stacked Barplot of item family against perishable flag')
plt.ylabel('Count of entries')
plt.show()

## Training Data

In [None]:
# Preview the first five rows of the training sample.
train.head(n=5)

In [None]:
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and later
# releases dropped the old names, so use the new name whenever it is available.
deep_style = 'seaborn-v0_8-deep' if 'seaborn-v0_8-deep' in plt.style.available else 'seaborn-deep'
plt.style.use(deep_style)
plt.figure(figsize=(13,11))
# Every row of the training sample: unit_sales against its date.
plt.plot(train.date.values, train.unit_sales)
plt.ylim(-50, 10000)
# The plotted column is unit_sales, not the transactions count.
plt.title('Unit sales in the training sample')
plt.ylabel('Unit sales')
plt.xlabel('Date')
plt.show()

# Further Analysis

To make further EDA easier, we use pre-aggregated data in which the sales are rolled up to different levels:

 - Day-Store level
 - Day-Item level
 - Store level
 - Item level
 - Day level

In [None]:
# Pre-aggregated sales tables produced by a separate notebook, read in this order:
# day x item, day x store, store x item.
sale_day_item_level, sale_day_store_level, sale_store_item_level = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/memory-optimization-data-manipulation/sale_day_item_level.csv",
        "../input/memory-optimization-data-manipulation/sale_day_store_level.csv",
        "../input/memory-optimization-data-manipulation/sale_store_item_level.csv",
    )
)

In [None]:
#Creating store level metrics
# Total store_sales and item_variety of every store, as a flat frame with the columns
# store_nbr, store_sales and item_variety.
# The columns are selected with a list (selecting with a tuple is rejected by current
# pandas) and summed directly, so no MultiIndex has to be flattened afterwards.
sale_store_level = (
    sale_day_store_level
    .groupby('store_nbr', as_index=False)[['store_sales', 'item_variety']]
    .sum()
)
sale_store_level.head()

In [None]:
#Creating item level metrics
# Total item_sales of every item, as a flat frame with the columns item_nbr and 'sum'
# (the column name that the later cells read). Summing the selected column directly
# gives the same layout regardless of how a pandas version combines as_index=False
# with a list of aggregations.
sale_item_level = (
    sale_day_item_level
    .groupby('item_nbr')['item_sales']
    .sum()
    .rename('sum')
    .reset_index()
)
sale_item_level.head()

In [None]:
# Sorting by sales
# Ten stores with the highest overall sales, indexed by store number.
temp=sale_store_level.sort_values('store_sales',ascending=False).reset_index(drop=True)
temp=temp.set_index('store_nbr').head(10)

plt.figure(figsize=(12,8))
# seaborn >= 0.12 only accepts the data vectors as keyword arguments. seaborn sorts
# numeric categories by value, so the ranking is passed as `order` to keep the bars
# in descending order of sales.
sns.barplot(x=temp.index, y=temp.store_sales, order=list(temp.index), alpha=0.6, color='blue')
plt.ylabel('Overall Sales', fontsize=12)
plt.xlabel('Store Number', fontsize=12)
plt.title('Top Stores by Overall sale', fontsize=15)
plt.show()

In [None]:
# Sorting by sales
# Ten items with the highest overall sales, indexed by item number.
temp1=sale_item_level.sort_values('sum',ascending=False).reset_index(drop=True)
temp1=temp1.set_index('item_nbr').head(10)
plt.figure(figsize=(12,8))
x=temp1.index.values
y=temp1['sum'].values
# seaborn >= 0.12 only accepts the data vectors as keyword arguments. seaborn sorts
# numeric categories by value, so the ranking is passed as `order` to keep the bars
# in descending order of sales.
sns.barplot(x=x, y=y, order=list(x), alpha=0.6, color='purple')
plt.ylabel('Overall Sales', fontsize=12)
# The bars are items, not stores.
plt.xlabel('Item Number', fontsize=12)
plt.title('Top Items by Overall sale', fontsize=15)
plt.show()

In [None]:
#YOY sales
# Total store_sales of every year.
temp=sale_day_store_level.groupby('Year')['store_sales'].sum()
plt.figure(figsize=(13,4))
# seaborn >= 0.12 only accepts the data vectors as keyword arguments.
sns.pointplot(x=temp.index, y=temp.values, alpha=0.8)
plt.ylabel('Overall Sales', fontsize=12)
plt.xlabel('Year', fontsize=12)
plt.title('Sale Year Over Year', fontsize=15)
plt.xticks(rotation='vertical')

plt.show()

In [None]:
# month over month sales
# Total sales of every (Year, Month) pair, reshaped to one row per month and one
# column per year. Selecting the lines by year label replaces slicing fixed blocks of
# 12 rows, which silently shifts every later year as soon as one month is missing.
monthly_sales = (
    sale_day_store_level
    .groupby(['Year', 'Month'])['store_sales']
    .sum()
    .unstack('Year')
)
sns.set(style="whitegrid", color_codes=True)
plt.figure(figsize=(15,8))
# Only the first four years are drawn, as before.
# NOTE(review): presumably these are 2013-2016 and the fifth year is incomplete - confirm.
for year in monthly_sales.columns[:4]:
    plt.plot(monthly_sales.index, monthly_sales[year], label=str(year))
plt.ylabel('Overall Sales', fontsize=12)
plt.xlabel('Month', fontsize=12)
plt.title('Monthly sales variation', fontsize=15)
plt.xticks(rotation='vertical')
# The legend is built from the line labels (two lines used to be labelled "2015").
plt.legend(loc='upper left')
plt.show()

### Store Distribution

In [None]:
#Count of stores in different types and clusters
# One panel of the 2x2 grid per store attribute:
# (column, x-axis label, title, index into the `color` palette, vertical tick labels?)
count_panels = [
    ('cluster', 'Cluster', 'Store distribution across cluster', 5, False),
    ('type', 'Type of store', 'Store distribution across store types', 7, False),
    ('state', 'state', 'Store distribution across states', 8, True),
    ('city', 'City', 'Store distribution across cities', 9, True),
]

plt.figure(figsize=(15,12))
# 221 .. 224 address the four cells of the 2x2 grid.
for position, (column, x_label, title, color_index, vertical_ticks) in enumerate(count_panels, start=221):
    plt.subplot(position)
    # Number of stores for every value of the attribute.
    temp = stores[column].value_counts()
    # seaborn >= 0.12 only accepts the data vectors as keyword arguments.
    sns.barplot(x=temp.index, y=temp.values, color=color[color_index])
    plt.ylabel('Count of stores', fontsize=12)
    plt.xlabel(x_label, fontsize=12)
    plt.title(title, fontsize=15)
    if vertical_ticks:
        plt.xticks(rotation='vertical')
plt.show()

### Sale distribution

In [None]:
# Keep the store id and its total sales (selected by name rather than by position,
# so the selection does not depend on the column order), then attach the store attributes.
sale_store_level=sale_store_level[['store_nbr', 'store_sales']]
merge=pd.merge(sale_store_level,stores,how='left',on='store_nbr')

#Sale of stores in different types and clusters
# One panel of the 2x2 grid per store attribute:
# (column, y-axis label, x-axis label, title, index into `color`, vertical tick labels?)
sales_panels = [
    ('cluster', 'Sales', 'Cluster', 'Cumulative sales across store clusters', 5, False),
    ('type', 'sales', 'Type of store', 'Cumulative sales across store types', 7, False),
    ('state', 'sales', 'state', 'Cumulative sales across states', 8, True),
    ('city', 'sales', 'City', 'Cumulative sales across cities', 9, True),
]

plt.figure(figsize=(15,12))
# 221 .. 224 address the four cells of the 2x2 grid.
for position, (column, y_label, x_label, title, color_index, vertical_ticks) in enumerate(sales_panels, start=221):
    plt.subplot(position)
    # Total sales of all stores that share a value of the attribute.
    temp = merge.groupby([column])['store_sales'].sum()
    # seaborn >= 0.12 only accepts the data vectors as keyword arguments.
    sns.barplot(x=temp.index, y=temp.values, color=color[color_index])
    plt.ylabel(y_label, fontsize=12)
    plt.xlabel(x_label, fontsize=12)
    plt.title(title, fontsize=15)
    if vertical_ticks:
        plt.xticks(rotation='vertical')
plt.show()

### Sale variation

In [None]:
# Store-level sales joined with the store attributes.
sale_store_level = sale_store_level.iloc[:, 0:2]
merge = pd.merge(sale_store_level, stores, how='left', on='store_nbr')

# (subplot position, store attribute, y label, x label, title, rotate x tick labels)
variation_panels = (
    (221, 'cluster', 'Sales', 'Cluster', 'Variation across store clusters', False),
    (222, 'type', 'sales', 'Type of store', 'Variation across store types', False),
    (223, 'state', 'sales', 'state', 'Variation across states', True),
    (224, 'city', 'sales', 'City', 'Variation across cities', True),
)

# One box plot of store sales per attribute, laid out on a 2x2 grid.
plt.figure(figsize=(15, 12))
for position, attribute, y_label, x_label, panel_title, rotate_ticks in variation_panels:
    plt.subplot(position)
    sns.boxplot(x=attribute, y="store_sales", data=merge)
    plt.ylabel(y_label, fontsize=12)
    plt.xlabel(x_label, fontsize=12)
    plt.title(panel_title, fontsize=15)
    if rotate_ticks:
        plt.xticks(rotation='vertical')
plt.show()

In [None]:
# Item-level sales enriched with item attributes (family, class, ...) and
# store attributes (type, cluster, ...).
store_items = pd.merge(sale_store_item_level, items, on='item_nbr')
store_items = pd.merge(store_items, stores, on='store_nbr')


def _top_sellers_by_type(sales, level, n=5):
    """Return the ``n`` best-selling values of ``level`` within each store type.

    Parameters
    ----------
    sales : DataFrame with at least the columns ``type``, ``level`` and ``item_sales``.
    level : name of the column to rank (e.g. ``'item_nbr'``, ``'class'``, ``'family'``).
    n : number of rows to keep per store type.

    Returns
    -------
    DataFrame with columns ``type``, ``level`` and ``item_sales``, ordered by
    store type and then by descending sales.
    """
    totals = sales.groupby(['type', level])['item_sales'].sum().reset_index()
    ranked = totals.sort_values(['type', 'item_sales'], ascending=[True, False])
    # rows are already sorted by sales inside each type, so head(n) keeps the top n
    return ranked.groupby(['type']).head(n)


# top 5 selling items, item classes and item families by store type
top_items_by_type = _top_sellers_by_type(store_items, 'item_nbr')
top_class_by_type = _top_sellers_by_type(store_items, 'class')
top_family_by_type = _top_sellers_by_type(store_items, 'family')

In [None]:
# Total sales of every item family within every store type,
# ordered by store type and then by descending sales.
top_family_by_type = (
    store_items
    .groupby(['type', 'family'])['item_sales']
    .sum()
    .reset_index()
    .sort_values(['type', 'item_sales'], ascending=[True, False])
)

cm = sns.light_palette("orange", as_cmap=True)

# family x store-type table, shaded with the orange gradient
x = (
    top_family_by_type
    .pivot(index='family', columns='type')
    .style
    .background_gradient(cmap=cm)
)
x

The distribution of sales across store types for the top 30 items is shown below.
The darker the color, the more that store type has contributed to the sales of that item.


In [None]:
# Total sales of every item within every store type,
# ordered by store type and then by descending sales.
top_items_by_type=store_items.groupby(['type','item_nbr'])['item_sales'].sum()
top_items_by_type=top_items_by_type.reset_index().sort_values(['type','item_sales'],ascending=[True,False])
# Keep at most 20 rows per item.
# NOTE(review): each (type, item_nbr) pair occurs once after the groupby above,
# so this only trims rows if there are more than 20 store types - confirm it is needed.
top_items_by_type=top_items_by_type.groupby(['item_nbr']).head(20)
# Reshape to one row per item and one column per store type.
x=top_items_by_type.pivot(index='item_nbr',columns='type')
# Rank the items by their sales summed over all store types;
# 'total' is a temporary helper column that is removed again after sorting.
x['total']=x.sum(axis=1)
x=x.sort_values('total',ascending=False)
del(x['total'])
# Keep the 30 best-selling items.
x=x.head(30)
cm = sns.light_palette("green", as_cmap=True)
# axis=1 shades each row on its own scale, i.e. per item across the store types.
x = x.style.background_gradient(cmap=cm,axis=1)
x

# feature engineering

**Here we analyze the data and select the features for our model to be trained on.**

**Train**
id, date, store_nbr, item_nbr, unit_sales, onpromotion

**Items**
item_nbr, family, class, perishable

**Holidays_events**
date, type, locale, locale_name, description, transferred

**Stores**
store_nbr, city, state, type, cluster

**Oil**
date, dcoilwtico

**Transactions**
date, store_nbr, transactions

**Selected features as inputs to the model**

date, holiday.type, holiday.locale, holiday.locale_name, holiday.transferred, store_nbr, store.city, store.state, store.type, store.cluster, transactions, item_nbr, item.family, item.class, onpromotion, perishable, dcoilwtico.

**Selected features as outputs of the model**

transactions per store, unit_sales per item

# DATA pipeline

### takes files in, outputs a complete dataframe

In [None]:
import datetime as dt
from sklearn.base import BaseEstimator, TransformerMixin

class prepare_data(BaseEstimator, TransformerMixin):
    """Join the raw tables into one frame ready for feature processing.

    ``transform`` expects ``X`` to be an indexable holding six DataFrames in
    this order: train, stores, oil, items, transactions, holidays/events.

    NOTE(review): no ``how`` is passed to the merges, so pandas uses its
    default inner join - rows without a match in every table are dropped.
    Confirm that this is intended.
    """

    def __init__(self):
        print("prepare_data -> init")

    def fit(self, X, y=None):
        # nothing to learn; present only to satisfy the transformer interface
        return self

    def transform(self, X, y=None):
        """Merge the six tables and return the cleaned, merged DataFrame."""
        merged = (
            X[0]
            .merge(X[1], on='store_nbr')             # + store attributes
            .merge(X[2], on='date')                  # + oil price of the day
            .merge(X[3], on='item_nbr')              # + item attributes
            .merge(X[4], on=['date', 'store_nbr'])   # + transactions per store and day
            .merge(X[5], on='date')                  # + holidays / events of the day
        )

        data_df = (
            merged
            # booleans become 0/1
            .astype({'onpromotion': int, 'transferred': int})
            # stores and holidays both carry a `type` column, which the merge
            # suffixed with _x / _y; give them meaningful names
            .rename(columns={'type_x': 'st_type', 'type_y': 'hol_type'})
            # the row id carries no information for the model
            .drop(columns=['id'])
        )

        print(data_df.head())

        # represent the date as its proleptic Gregorian ordinal (an integer)
        data_df['date'] = pd.to_datetime(data_df['date']).map(dt.datetime.toordinal)

        return data_df

### Custom transform for splitting the data

In [None]:
# split dataframe into numerical values, categorical values and date
class split_data(BaseEstimator, TransformerMixin):
    """Split a frame into its numerical columns, categorical columns and date column."""

    def __init__(self):
        print("split_data -> init")

    def fit(self, X, y=None):
        # nothing to learn; present only to satisfy the transformer interface
        return self

    def transform(self, X, y=None):
        """Return ``(numerical frame, categorical frame, date series)`` for ``X``.

        ``X`` must contain a ``date`` column; it is excluded from the
        numerical/categorical split and returned on its own.
        """
        # Classify every column except the date
        df_ = X.drop(['date'], axis=1)
        num_cols = df_._get_numeric_data().columns
        numeric_names = set(num_cols)
        # Everything that is not numeric is categorical. The original column
        # order is preserved: a set difference would yield an arbitrary,
        # run-dependent order and therefore an unstable one-hot feature order.
        cat_cols = [col for col in df_.columns if col not in numeric_names]

        data_num_df = X[num_cols]
        data_cat_df = X[cat_cols]
        data_date_df = X['date']

        return data_num_df, data_cat_df, data_date_df

### Custom transform

1. Fill missing data in numerical attributes
2. apply standard scaler to numerical attributes
3. Convert categorical data into numerical

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

class process_data(BaseEstimator, TransformerMixin):
    """Impute and scale the numerical part and one-hot encode the categorical part.

    ``transform`` expects ``X`` to be ``(numerical frame, categorical frame,
    date series)`` and returns a tuple of the same shape.

    NOTE(review): the imputer, scaler and encoder are created and fitted inside
    ``transform``, so every call re-fits them on the data it is given.
    NOTE(review): ``OneHotEncoder(sparse=...)`` and ``get_feature_names()`` are
    the older scikit-learn spellings; newer releases use ``sparse_output=`` and
    ``get_feature_names_out()`` - confirm against the installed version.
    """

    def __init__(self):
        print("process_data -> init")

    def fit(self, X, y=None):
        # nothing is learned here; see the class note about fitting in transform
        return self

    def transform(self, X, y=None):
        """Return ``(scaled numerical frame, one-hot frame, untouched date series)``."""
        num_df, cat_df, date_col = X[0], X[1], X[2]

        ### numerical data
        # replace nulls by the column mean (copy must be a bool, not the string "true")
        imputer = SimpleImputer(strategy="mean", copy=True)
        num_imp = imputer.fit_transform(num_df)

        # standardise every numerical column to zero mean and unit variance
        scaler = StandardScaler()
        num_scaled = scaler.fit_transform(num_imp)
        data_num_df = pd.DataFrame(num_scaled, columns=num_df.columns, index=num_df.index)

        ### categorical data
        # one-hot encode into a dense array: one column per category
        cat_encoder = OneHotEncoder(sparse=False)
        data_cat_1hot = cat_encoder.fit_transform(cat_df)

        # Keep the input index: the parts are later joined on their index, so a
        # fresh RangeIndex here would misalign rows for any non-default index.
        data_cat_df = pd.DataFrame(
            data_cat_1hot,
            columns=cat_encoder.get_feature_names(),
            index=cat_df.index,
        )

        return data_num_df, data_cat_df, date_col

### Custom transform for joining the numerical, categorical and date parts

In [None]:
class join_df(BaseEstimator, TransformerMixin):
    """Recombine the three parts of a split frame into one DataFrame.

    ``transform`` expects ``X`` to hold three pandas objects and joins the
    second and then the third onto the first, aligning on the index.
    """

    def __init__(self):
        print("join_df -> init")

    def fit(self, X, y=None):
        # nothing to learn; present only to satisfy the transformer interface
        return self

    def transform(self, X, y=None):
        """Return ``X[0]`` with ``X[1]`` and ``X[2]`` joined onto it."""
        first_two = X[0].join(X[1])
        return first_two.join(X[2])

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Full preprocessing chain: merge the raw tables, split by column kind,
# impute/scale/encode, then stitch the parts back together.
pipe_processing = Pipeline(steps=[
    ('prepare_data', prepare_data()),
    ('split_data', split_data()),
    ('process_data', process_data()),
    ('join_data', join_df()),
])

# our prepared data; the table order must match what prepare_data expects
raw_tables = [train_large, stores, oil, items, transactions, holiday_events]
data_df = pipe_processing.fit_transform(raw_tables)

# split it according to our feature engineering: the two targets vs. everything else
target_cols = ['unit_sales', 'transactions']
X = data_df.drop(columns=target_cols)
Y = data_df[target_cols]

### Generate test and training data


In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=0,
)

# Modelling and testing

### Generic function for modelling and testing

In [None]:
from sklearn.linear_model import LinearRegression,SGDRegressor,ElasticNet,Ridge
from sklearn.svm import SVC
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error,mean_absolute_error


def checkModelPerformane(model):
    """Fit ``model`` on the training split and print its errors on the test split.

    Reads the notebook-level ``x_train``, ``y_train``, ``x_test`` and
    ``y_test`` frames. ``model`` must provide ``fit(X, y)`` and ``predict(X)``.

    Prints the root mean squared error and the mean absolute error of the
    predictions; returns nothing.
    """
    model.fit(x_train.values, y_train.values)

    pred = model.predict(x_test.values)

    # RMSE is the square root of the MSE, so label it as such
    rmse = np.sqrt(mean_squared_error(y_test.values, pred))
    # MAE is already in the units of the target: no square root here
    mae = mean_absolute_error(y_test.values, pred)

    print("root_mean_squared_error: ", rmse)
    print("mean_absolute_error: ", mae)

### Linear regression

In [None]:
# Baseline: ordinary least-squares regression with default settings.
print("LinearRegression")
checkModelPerformane(LinearRegression())

### Lasso regression

In [None]:
# L1-regularised linear regression; alpha=0.1 sets the penalty strength.
print("lasso regression ")
checkModelPerformane(linear_model.Lasso(alpha=0.1))

### ElasticNet regression

In [None]:
# Linear regression with a combined L1/L2 penalty, using the default settings.
print("ElasticNet regression ")
checkModelPerformane(ElasticNet())

### Ridge regression

In [None]:
# L2-regularised linear regression; alpha=1.0 sets the penalty strength.
print("Ridge regression ")
checkModelPerformane(Ridge(alpha=1.0))

### Random forests

In [None]:
# Random forest with default hyper-parameters; the seed is fixed for reproducibility.
print("Random Forest")
checkModelPerformane(RandomForestRegressor(random_state=42)) 

Random forest model has the lowest error, thus we are going to use it and fine tune it.

# Model fine tuning

### Grid search

In [None]:
from sklearn.model_selection import GridSearchCV

# Two grids are searched: 3 x 4 = 12 combinations with the default
# bootstrapping, plus 2 x 3 = 6 combinations with bootstrapping switched off.
param_grid = [
    {
        'n_estimators': [3, 10, 30],
        'max_features': [2, 4, 6, 8],
    },
    {
        'bootstrap': [False],
        'n_estimators': [3, 10],
        'max_features': [2, 3, 4],
    },
]

forest_reg = RandomForestRegressor(random_state=42)

# Every combination is scored with 5-fold cross-validation on negative MSE;
# train scores are kept as well so over-fitting can be inspected afterwards.
grid_search = GridSearchCV(
    estimator=forest_reg,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    return_train_score=True,
)
grid_search.fit(x_train.values, y_train.values)

### find out the best parameters for our model

In [None]:
# Hyper-parameter combination that achieved the best mean cross-validated score.
grid_search.best_params_

In [None]:
# The estimator configured with the best hyper-parameter combination found.
grid_search.best_estimator_

# Test model on test set

In [None]:
# Peek at the first lines of the raw test file.
!head test.csv

In [None]:
# The tuned random forest selected by the grid search.
final_model = grid_search.best_estimator_

# load the raw test set
test = pd.read_csv("../working/test.csv", parse_dates=['date'])

# process it with the split / process / join steps only
# NOTE(review): unlike pipe_processing, this pipeline has no prepare_data step,
# so the test rows are not merged with the store, oil, item, transaction and
# holiday tables and `date` is not converted to an ordinal. The resulting
# columns presumably do not match the features the model was trained on -
# confirm before calling final_model.predict.
pipe_processing2 = Pipeline(steps=[
    ('split_data', split_data()),
    ('process_data', process_data()),
    ('join_data', join_df()),
])

test_df = pipe_processing2.fit_transform(test)



In [None]:
# Inspect the processed test frame.
test_df


In [None]:
# final_predictions = final_model.predict(test_x)