# 2.7 Extra Challenge (done)

In [1]:
# to make the .py script runnable
#!/usr/bin/env python

In [2]:
from sklearn import datasets
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline
plt.style.use('ggplot')

In [3]:
import os

## 2.7.1 Data Context

A retailer has approached you because he is unable to get a grip on the sales forecasting of the stores. He believes he has good data quality but he does not know how to leverage this data in order to create a better forecast. He would like both short and long term forecasts. The following information has been provided by the retailer: 

store_info.csv

Information on the type and size of the various stores

historic_sales.csv

* Store - the store number
* Dept - the department number
* Date - the week
* Weekly_Sales -  sales for the given department in the given store
* IsHoliday - whether the week is a special holiday week

features.csv

* Store - the store number
* Date - the week
* Temperature - average temperature in the region
* Fuel_Price - cost of fuel in the region
* MarkDown1-5 - anonymized data related to promotional markdowns
* CPI - the consumer price index
* Unemployment - the unemployment rate
* IsHoliday - whether the week is a special holiday week

## 2.7.2 Your task

1. Import and explore the data 
2. Do your basic sanity checks - there might be some issues...
3. If data is missing or incorrect, impute something
4. Merge the two data tables
5. Calculate the summary statistics for this data

In [5]:
#df_historic = pd.read_csv("data/historic_sales.csv", sep="|")
df_historic = pd.read_csv("data/historic_sales.csv", sep="|", index_col="Unnamed: 0")

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
df_historic.shape

(421570, 5)

In [30]:
df_historic.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 421570 entries, 0 to 421569
Data columns (total 5 columns):
Store           421570 non-null int64
Dept            421570 non-null object
Date            421570 non-null object
Weekly_Sales    418805 non-null object
IsHoliday       421569 non-null object
dtypes: int64(1), object(4)
memory usage: 19.3+ MB


In [31]:
df_historic.columns

Index(['Store', 'Dept', 'Date', 'Weekly_Sales', 'IsHoliday'], dtype='object')

In [22]:
df_historic.describe()

Unnamed: 0.1,Unnamed: 0,Store
count,421570.0,421570.0
mean,210784.5,22.200546
std,121696.920829,12.785297
min,0.0,1.0
25%,105392.25,11.0
50%,210784.5,22.0
75%,316176.75,33.0
max,421569.0,45.0


In [32]:
df_historic.index

Int64Index([     0,      1,      2,      3,      4,      5,      6,      7,
                 8,      9,
            ...
            421560, 421561, 421562, 421563, 421564, 421565, 421566, 421567,
            421568, 421569],
           dtype='int64', length=421570)

In [33]:
df_historic.head()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday
0,1,1,2010-02-05,249245,False
1,1,1,2010-02-12,4603949,True
2,1,1,2010-02-19,4159555,False
3,1,1,2010-02-26,1940354,False
4,1,1,2010-03-05,218279,False


In [34]:
df_historic.tail()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday
421565,45,98,2012-09-28,50837,False
421566,45,98,2012-10-05,6281,False
421567,45,98,2012-10-12,106102,False
421568,45,98,2012-10-19,76001,False
421569,45,98,2012-10-26,10768,False


In [35]:
df_historic.sample()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday
241486,25,33,2011-12-23,542259,False


In [38]:
#df_features = pd.read_csv("data/features.csv", sep="|")
df_features = pd.read_csv("data/features.csv", sep="|", index_col="Unnamed: 0")

In [39]:
df_features.columns

Index(['Store', 'Date', 'Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2',
       'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment',
       'IsHoliday'],
      dtype='object')

In [40]:
df_features.shape

(8190, 12)

In [41]:
df_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8190 entries, 0 to 8189
Data columns (total 12 columns):
Store           8190 non-null int64
Date            8190 non-null object
Temperature     8190 non-null float64
Fuel_Price      8190 non-null float64
MarkDown1       4032 non-null float64
MarkDown2       2921 non-null float64
MarkDown3       3613 non-null float64
MarkDown4       3464 non-null float64
MarkDown5       4050 non-null float64
CPI             7605 non-null float64
Unemployment    7605 non-null float64
IsHoliday       8190 non-null bool
dtypes: bool(1), float64(9), int64(1), object(1)
memory usage: 775.8+ KB


In [42]:
df_features.describe()

Unnamed: 0,Store,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment
count,8190.0,8190.0,8190.0,4032.0,2921.0,3613.0,3464.0,4050.0,7605.0,7605.0
mean,23.0,59.356198,3.405992,7032.371786,3384.176594,1760.10018,3292.935886,4132.216422,172.460809,7.826821
std,12.987966,18.678607,0.431337,9262.747448,8793.583016,11276.462208,6792.329861,13086.690278,39.738346,1.877259
min,1.0,-7.29,2.472,-2781.45,-265.76,-179.26,0.22,-185.17,126.064,3.684
25%,12.0,45.9025,3.041,1577.5325,68.88,6.6,304.6875,1440.8275,132.364839,6.634
50%,23.0,60.71,3.513,4743.58,364.57,36.26,1176.425,2727.135,182.764003,7.806
75%,34.0,73.88,3.743,8923.31,2153.35,163.15,3310.0075,4832.555,213.932412,8.567
max,45.0,101.95,4.468,103184.98,104519.54,149483.31,67474.85,771448.1,228.976456,14.313
